Message ID | 1461185716-7332-1-git-send-email-gftg@linux.vnet.ibm.com |
---|---|
State | New |
Headers | show |
I am ok with this patch, the only nit is it now calls memset for all padding size, where for small padding (less than 16 or maybe 32) I think it would be still faster than calling memset (since dcbtst will be used only for large inputs). However I still not sure if embedding the memset write_LT_32 would yield much gain as well. On 20-04-2016 17:55, Gabriel F. T. Gomes wrote: > Call __memset_power8 to pad, with zeros, the remaining bytes in the > dest string on __strncpy_power8 and __stpncpy_power8. This improves > performance when n is larger than the input string, giving ~30% gain for > larger strings without impacting much shorter strings. > > 2016-04-18 Gabriel F. T. Gomes <gftg@linux.vnet.ibm.com> > > * sysdeps/powerpc/powerpc64/power8/strncpy.S: Call memset to pad the > remaining bytes in the dest string, with zeros. > --- > sysdeps/powerpc/powerpc64/power8/strncpy.S | 123 +++++++++++++---------------- > 1 file changed, 56 insertions(+), 67 deletions(-) > > diff --git a/sysdeps/powerpc/powerpc64/power8/strncpy.S b/sysdeps/powerpc/powerpc64/power8/strncpy.S > index 17c3afb..0bb3bd4 100644 > --- a/sysdeps/powerpc/powerpc64/power8/strncpy.S > +++ b/sysdeps/powerpc/powerpc64/power8/strncpy.S > @@ -24,6 +24,8 @@ > # define FUNC_NAME strncpy > #endif > > +#define FRAMESIZE (FRAME_MIN_SIZE+48) > + > /* Implements the function > > char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5]) > @@ -54,8 +56,7 @@ EALIGN (FUNC_NAME, 4, 0) > addi r10,r4,16 > rlwinm r9,r4,0,19,19 > > - /* Since it is a leaf function, save some non-volatile registers on the > - protected/red zone. */ > + /* Save some non-volatile registers on the stack. */ > std r26,-48(r1) > std r27,-40(r1) > > @@ -69,6 +70,14 @@ EALIGN (FUNC_NAME, 4, 0) > std r30,-16(r1) > std r31,-8(r1) > > + /* Update CFI. */ > + cfi_offset(r26, -48) > + cfi_offset(r27, -40) > + cfi_offset(r28, -32) > + cfi_offset(r29, -24) > + cfi_offset(r30, -16) > + cfi_offset(r31, -8) > + > beq cr7,L(unaligned_lt_16) > rldicl r9,r4,0,61 > subfic r8,r9,8 > @@ -180,74 +189,58 @@ L(short_path_loop_end): > ld r31,-8(r1) > blr > > - /* This code pads the remainder dest with NULL bytes. The algorithm > - calculate the remanining size and issues a doubleword unrolled > - loops followed by a byte a byte set. */ > + /* This code pads the remainder of dest with NULL bytes. The algorithm > + calculates the remaining size and calls memset. */ > .align 4 > L(zero_pad_start): > mr r5,r10 > mr r9,r6 > L(zero_pad_start_1): > - srdi. r8,r5,r3 > - mr r10,r9 > -#ifdef USE_AS_STPNCPY > - mr r3,r9 > + /* At this point: > + - r5 holds the number of bytes that still have to be written to > + dest. > + - r9 points to the position, in dest, where the first null byte > + will be written. > + The above statements are true both when control reaches this label > + from a branch or when falling through the previous lines. */ > +#ifndef USE_AS_STPNCPY > + mr r30,r3 /* Save the return value of strncpy. */ > +#endif > + /* Prepare the call to memset. */ > + mr r3,r9 /* Pointer to the area to be zero-filled. */ > + li r4,0 /* Byte to be written (zero). */ > + > + /* We delayed the creation of the stack frame, as well as the saving of > + the link register, because only at this point, we are sure that > + doing so is actually needed. */ > + > + /* Save the link register. */ > + mflr r0 > + std r0,16(r1) > + cfi_offset(lr, 16) > + > + /* Create the stack frame. */ > + stdu r1,-FRAMESIZE(r1) > + cfi_adjust_cfa_offset(FRAMESIZE) > + > + bl __memset_power8 > + nop > + > + /* Restore the stack frame. */ > + addi r1,r1,FRAMESIZE > + cfi_adjust_cfa_offset(-FRAMESIZE) > + /* Restore the link register. */ > + ld r0,16(r1) > + mtlr r0 > + > +#ifndef USE_AS_STPNCPY > + mr r3,r30 /* Restore the return value of strncpy, i.e.: > + dest. For stpncpy, the return value is the > + same as return value of memset. */ > #endif > - beq- cr0,L(zero_pad_loop_b_start) > - cmpldi cr7,r8,1 > - li cr7,0 > - std r7,0(r9) > - beq cr7,L(zero_pad_loop_b_prepare) > - addic. r8,r8,-2 > - addi r10,r9,r16 > - std r7,8(r9) > - beq cr0,L(zero_pad_loop_dw_2) > - std r7,16(r9) > - li r9,0 > - b L(zero_pad_loop_dw_1) > - > - .align 4 > -L(zero_pad_loop_dw): > - addi r10,r10,16 > - std r9,-8(r10) > - beq cr0,L(zero_pad_loop_dw_2) > - std r9,0(r10) > -L(zero_pad_loop_dw_1): > - cmpldi cr7,r8,1 > - std r9,0(r10) > - addic. r8,r8,-2 > - bne cr7,L(zero_pad_loop_dw) > - addi r10,r10,8 > -L(zero_pad_loop_dw_2): > - rldicl r5,r5,0,61 > -L(zero_pad_loop_b_start): > - cmpdi cr7,r5,0 > - addi r5,r5,-1 > - addi r9,r10,-1 > - add r10,r10,5 > - subf r10,r9,r10 > - li r8,0 > - beq- cr7,L(short_path_loop_end) > - > - /* Write remaining 1-8 bytes. */ > - .align 4 > - addi r9,r9,1 > - mtocrf 0x1,r10 > - bf 29,4f > - stw r8,0(r9) > - addi r9,r9,4 > - > - .align 4 > -4: bf 30,2f > - sth r8,0(r9) > - addi r9,r9,2 > - > - .align 4 > -2: bf 31,1f > - stb r8,0(r9) > > - /* Restore non-volatile registers. */ > -1: ld r26,-48(r1) > + /* Restore non-volatile registers and return. */ > + ld r26,-48(r1) > ld r27,-40(r1) > ld r28,-32(r1) > ld r29,-24(r1) > @@ -443,10 +436,6 @@ L(short_path_prepare_2_3): > mr r4,r28 > mr r9,r29 > b L(short_path_2) > -L(zero_pad_loop_b_prepare): > - addi r10,r9,8 > - rldicl r5,r5,0,61 > - b L(zero_pad_loop_b_start) > L(zero_pad_start_prepare_1): > mr r5,r6 > mr r9,r8 >
On Wed, 20 Apr 2016 18:25:37 -0300 Adhemerval Zanella <adhemerval.zanella@linaro.org> wrote: > I am ok with this patch, the only nit is it now calls memset for all > padding size, where for small padding (less than 16 or maybe 32) I think it would > be still faster than calling memset (since dcbtst will be used only for > large inputs). However I still not sure if embedding the memset write_LT_32 > would yield much gain as well. Hi, Adhemerval. Thanks for your review. I also expected that for small amounts of padding, calling memset would harm performance. However, I ran some tests, where memset was only called for padding amounts greater than 32 bytes, and I didn't notice any significant changes in performance. That's the reason why I sent this version of the patch, which always calls memset. I also noticed that bench-strncpy exposes a load-hit-store problem, which might be suppressing the difference for small paddings. I have plans to fix this in the future, and, if necessary, limit memset to greater paddings. However, right now, I don't see a benefit in adding the check. > On 20-04-2016 17:55, Gabriel F. T. Gomes wrote: > > Call __memset_power8 to pad, with zeros, the remaining bytes in the > > dest string on __strncpy_power8 and __stpncpy_power8. This improves > > performance when n is larger than the input string, giving ~30% gain for > > larger strings without impacting much shorter strings. > > > > 2016-04-18 Gabriel F. T. Gomes <gftg@linux.vnet.ibm.com> > > > > * sysdeps/powerpc/powerpc64/power8/strncpy.S: Call memset to pad the > > remaining bytes in the dest string, with zeros. > > --- > > sysdeps/powerpc/powerpc64/power8/strncpy.S | 123 +++++++++++++---------------- > > 1 file changed, 56 insertions(+), 67 deletions(-) > > > > diff --git a/sysdeps/powerpc/powerpc64/power8/strncpy.S b/sysdeps/powerpc/powerpc64/power8/strncpy.S > > index 17c3afb..0bb3bd4 100644 > > --- a/sysdeps/powerpc/powerpc64/power8/strncpy.S > > +++ b/sysdeps/powerpc/powerpc64/power8/strncpy.S > > @@ -24,6 +24,8 @@ > > # define FUNC_NAME strncpy > > #endif > > > > +#define FRAMESIZE (FRAME_MIN_SIZE+48) > > + > > /* Implements the function > > > > char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5]) > > @@ -54,8 +56,7 @@ EALIGN (FUNC_NAME, 4, 0) > > addi r10,r4,16 > > rlwinm r9,r4,0,19,19 > > > > - /* Since it is a leaf function, save some non-volatile registers on the > > - protected/red zone. */ > > + /* Save some non-volatile registers on the stack. */ > > std r26,-48(r1) > > std r27,-40(r1) > > > > @@ -69,6 +70,14 @@ EALIGN (FUNC_NAME, 4, 0) > > std r30,-16(r1) > > std r31,-8(r1) > > > > + /* Update CFI. */ > > + cfi_offset(r26, -48) > > + cfi_offset(r27, -40) > > + cfi_offset(r28, -32) > > + cfi_offset(r29, -24) > > + cfi_offset(r30, -16) > > + cfi_offset(r31, -8) > > + > > beq cr7,L(unaligned_lt_16) > > rldicl r9,r4,0,61 > > subfic r8,r9,8 > > @@ -180,74 +189,58 @@ L(short_path_loop_end): > > ld r31,-8(r1) > > blr > > > > - /* This code pads the remainder dest with NULL bytes. The algorithm > > - calculate the remanining size and issues a doubleword unrolled > > - loops followed by a byte a byte set. */ > > + /* This code pads the remainder of dest with NULL bytes. The algorithm > > + calculates the remaining size and calls memset. */ > > .align 4 > > L(zero_pad_start): > > mr r5,r10 > > mr r9,r6 > > L(zero_pad_start_1): > > - srdi. r8,r5,r3 > > - mr r10,r9 > > -#ifdef USE_AS_STPNCPY > > - mr r3,r9 > > + /* At this point: > > + - r5 holds the number of bytes that still have to be written to > > + dest. > > + - r9 points to the position, in dest, where the first null byte > > + will be written. > > + The above statements are true both when control reaches this label > > + from a branch or when falling through the previous lines. */ > > +#ifndef USE_AS_STPNCPY > > + mr r30,r3 /* Save the return value of strncpy. */ > > +#endif > > + /* Prepare the call to memset. */ > > + mr r3,r9 /* Pointer to the area to be zero-filled. */ > > + li r4,0 /* Byte to be written (zero). */ > > + > > + /* We delayed the creation of the stack frame, as well as the saving of > > + the link register, because only at this point, we are sure that > > + doing so is actually needed. */ > > + > > + /* Save the link register. */ > > + mflr r0 > > + std r0,16(r1) > > + cfi_offset(lr, 16) > > + > > + /* Create the stack frame. */ > > + stdu r1,-FRAMESIZE(r1) > > + cfi_adjust_cfa_offset(FRAMESIZE) > > + > > + bl __memset_power8 > > + nop > > + > > + /* Restore the stack frame. */ > > + addi r1,r1,FRAMESIZE > > + cfi_adjust_cfa_offset(-FRAMESIZE) > > + /* Restore the link register. */ > > + ld r0,16(r1) > > + mtlr r0 > > + > > +#ifndef USE_AS_STPNCPY > > + mr r3,r30 /* Restore the return value of strncpy, i.e.: > > + dest. For stpncpy, the return value is the > > + same as return value of memset. */ > > #endif > > - beq- cr0,L(zero_pad_loop_b_start) > > - cmpldi cr7,r8,1 > > - li cr7,0 > > - std r7,0(r9) > > - beq cr7,L(zero_pad_loop_b_prepare) > > - addic. r8,r8,-2 > > - addi r10,r9,r16 > > - std r7,8(r9) > > - beq cr0,L(zero_pad_loop_dw_2) > > - std r7,16(r9) > > - li r9,0 > > - b L(zero_pad_loop_dw_1) > > - > > - .align 4 > > -L(zero_pad_loop_dw): > > - addi r10,r10,16 > > - std r9,-8(r10) > > - beq cr0,L(zero_pad_loop_dw_2) > > - std r9,0(r10) > > -L(zero_pad_loop_dw_1): > > - cmpldi cr7,r8,1 > > - std r9,0(r10) > > - addic. r8,r8,-2 > > - bne cr7,L(zero_pad_loop_dw) > > - addi r10,r10,8 > > -L(zero_pad_loop_dw_2): > > - rldicl r5,r5,0,61 > > -L(zero_pad_loop_b_start): > > - cmpdi cr7,r5,0 > > - addi r5,r5,-1 > > - addi r9,r10,-1 > > - add r10,r10,5 > > - subf r10,r9,r10 > > - li r8,0 > > - beq- cr7,L(short_path_loop_end) > > - > > - /* Write remaining 1-8 bytes. */ > > - .align 4 > > - addi r9,r9,1 > > - mtocrf 0x1,r10 > > - bf 29,4f > > - stw r8,0(r9) > > - addi r9,r9,4 > > - > > - .align 4 > > -4: bf 30,2f > > - sth r8,0(r9) > > - addi r9,r9,2 > > - > > - .align 4 > > -2: bf 31,1f > > - stb r8,0(r9) > > > > - /* Restore non-volatile registers. */ > > -1: ld r26,-48(r1) > > + /* Restore non-volatile registers and return. */ > > + ld r26,-48(r1) > > ld r27,-40(r1) > > ld r28,-32(r1) > > ld r29,-24(r1) > > @@ -443,10 +436,6 @@ L(short_path_prepare_2_3): > > mr r4,r28 > > mr r9,r29 > > b L(short_path_2) > > -L(zero_pad_loop_b_prepare): > > - addi r10,r9,8 > > - rldicl r5,r5,0,61 > > - b L(zero_pad_loop_b_start) > > L(zero_pad_start_prepare_1): > > mr r5,r6 > > mr r9,r8 > > >
On 26/04/2016 10:41, Gabriel F. T. Gomes wrote: > On Wed, 20 Apr 2016 18:25:37 -0300 > Adhemerval Zanella <adhemerval.zanella@linaro.org> wrote: > >> I am ok with this patch, the only nit is it now calls memset for all >> padding size, where for small padding (less than 16 or maybe 32) I think it would >> be still faster than calling memset (since dcbtst will be used only for >> large inputs). However I still not sure if embedding the memset write_LT_32 >> would yield much gain as well. > > Hi, Adhemerval. Thanks for your review. > > I also expected that for small amounts of padding, calling memset would > harm performance. However, I ran some tests, where memset was only > called for padding amounts greater than 32 bytes, and I didn't notice > any significant changes in performance. That's the reason why I sent > this version of the patch, which always calls memset. > > I also noticed that bench-strncpy exposes a load-hit-store problem, > which might be suppressing the difference for small paddings. I have > plans to fix this in the future, and, if necessary, limit memset to > greater paddings. > > However, right now, I don't see a benefit in adding the check. > > Fair enough then. >> On 20-04-2016 17:55, Gabriel F. T. Gomes wrote: >>> Call __memset_power8 to pad, with zeros, the remaining bytes in the >>> dest string on __strncpy_power8 and __stpncpy_power8. This improves >>> performance when n is larger than the input string, giving ~30% gain for >>> larger strings without impacting much shorter strings. >>> >>> 2016-04-18 Gabriel F. T. Gomes <gftg@linux.vnet.ibm.com> >>> >>> * sysdeps/powerpc/powerpc64/power8/strncpy.S: Call memset to pad the >>> remaining bytes in the dest string, with zeros. >>> --- >>> sysdeps/powerpc/powerpc64/power8/strncpy.S | 123 +++++++++++++---------------- >>> 1 file changed, 56 insertions(+), 67 deletions(-) >>> >>> diff --git a/sysdeps/powerpc/powerpc64/power8/strncpy.S b/sysdeps/powerpc/powerpc64/power8/strncpy.S >>> index 17c3afb..0bb3bd4 100644 >>> --- a/sysdeps/powerpc/powerpc64/power8/strncpy.S >>> +++ b/sysdeps/powerpc/powerpc64/power8/strncpy.S >>> @@ -24,6 +24,8 @@ >>> # define FUNC_NAME strncpy >>> #endif >>> >>> +#define FRAMESIZE (FRAME_MIN_SIZE+48) >>> + >>> /* Implements the function >>> >>> char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5]) >>> @@ -54,8 +56,7 @@ EALIGN (FUNC_NAME, 4, 0) >>> addi r10,r4,16 >>> rlwinm r9,r4,0,19,19 >>> >>> - /* Since it is a leaf function, save some non-volatile registers on the >>> - protected/red zone. */ >>> + /* Save some non-volatile registers on the stack. */ >>> std r26,-48(r1) >>> std r27,-40(r1) >>> >>> @@ -69,6 +70,14 @@ EALIGN (FUNC_NAME, 4, 0) >>> std r30,-16(r1) >>> std r31,-8(r1) >>> >>> + /* Update CFI. */ >>> + cfi_offset(r26, -48) >>> + cfi_offset(r27, -40) >>> + cfi_offset(r28, -32) >>> + cfi_offset(r29, -24) >>> + cfi_offset(r30, -16) >>> + cfi_offset(r31, -8) >>> + >>> beq cr7,L(unaligned_lt_16) >>> rldicl r9,r4,0,61 >>> subfic r8,r9,8 >>> @@ -180,74 +189,58 @@ L(short_path_loop_end): >>> ld r31,-8(r1) >>> blr >>> >>> - /* This code pads the remainder dest with NULL bytes. The algorithm >>> - calculate the remanining size and issues a doubleword unrolled >>> - loops followed by a byte a byte set. */ >>> + /* This code pads the remainder of dest with NULL bytes. The algorithm >>> + calculates the remaining size and calls memset. */ >>> .align 4 >>> L(zero_pad_start): >>> mr r5,r10 >>> mr r9,r6 >>> L(zero_pad_start_1): >>> - srdi. r8,r5,r3 >>> - mr r10,r9 >>> -#ifdef USE_AS_STPNCPY >>> - mr r3,r9 >>> + /* At this point: >>> + - r5 holds the number of bytes that still have to be written to >>> + dest. >>> + - r9 points to the position, in dest, where the first null byte >>> + will be written. >>> + The above statements are true both when control reaches this label >>> + from a branch or when falling through the previous lines. */ >>> +#ifndef USE_AS_STPNCPY >>> + mr r30,r3 /* Save the return value of strncpy. */ >>> +#endif >>> + /* Prepare the call to memset. */ >>> + mr r3,r9 /* Pointer to the area to be zero-filled. */ >>> + li r4,0 /* Byte to be written (zero). */ >>> + >>> + /* We delayed the creation of the stack frame, as well as the saving of >>> + the link register, because only at this point, we are sure that >>> + doing so is actually needed. */ >>> + >>> + /* Save the link register. */ >>> + mflr r0 >>> + std r0,16(r1) >>> + cfi_offset(lr, 16) >>> + >>> + /* Create the stack frame. */ >>> + stdu r1,-FRAMESIZE(r1) >>> + cfi_adjust_cfa_offset(FRAMESIZE) >>> + >>> + bl __memset_power8 >>> + nop >>> + >>> + /* Restore the stack frame. */ >>> + addi r1,r1,FRAMESIZE >>> + cfi_adjust_cfa_offset(-FRAMESIZE) >>> + /* Restore the link register. */ >>> + ld r0,16(r1) >>> + mtlr r0 >>> + >>> +#ifndef USE_AS_STPNCPY >>> + mr r3,r30 /* Restore the return value of strncpy, i.e.: >>> + dest. For stpncpy, the return value is the >>> + same as return value of memset. */ >>> #endif >>> - beq- cr0,L(zero_pad_loop_b_start) >>> - cmpldi cr7,r8,1 >>> - li cr7,0 >>> - std r7,0(r9) >>> - beq cr7,L(zero_pad_loop_b_prepare) >>> - addic. r8,r8,-2 >>> - addi r10,r9,r16 >>> - std r7,8(r9) >>> - beq cr0,L(zero_pad_loop_dw_2) >>> - std r7,16(r9) >>> - li r9,0 >>> - b L(zero_pad_loop_dw_1) >>> - >>> - .align 4 >>> -L(zero_pad_loop_dw): >>> - addi r10,r10,16 >>> - std r9,-8(r10) >>> - beq cr0,L(zero_pad_loop_dw_2) >>> - std r9,0(r10) >>> -L(zero_pad_loop_dw_1): >>> - cmpldi cr7,r8,1 >>> - std r9,0(r10) >>> - addic. r8,r8,-2 >>> - bne cr7,L(zero_pad_loop_dw) >>> - addi r10,r10,8 >>> -L(zero_pad_loop_dw_2): >>> - rldicl r5,r5,0,61 >>> -L(zero_pad_loop_b_start): >>> - cmpdi cr7,r5,0 >>> - addi r5,r5,-1 >>> - addi r9,r10,-1 >>> - add r10,r10,5 >>> - subf r10,r9,r10 >>> - li r8,0 >>> - beq- cr7,L(short_path_loop_end) >>> - >>> - /* Write remaining 1-8 bytes. */ >>> - .align 4 >>> - addi r9,r9,1 >>> - mtocrf 0x1,r10 >>> - bf 29,4f >>> - stw r8,0(r9) >>> - addi r9,r9,4 >>> - >>> - .align 4 >>> -4: bf 30,2f >>> - sth r8,0(r9) >>> - addi r9,r9,2 >>> - >>> - .align 4 >>> -2: bf 31,1f >>> - stb r8,0(r9) >>> >>> - /* Restore non-volatile registers. */ >>> -1: ld r26,-48(r1) >>> + /* Restore non-volatile registers and return. */ >>> + ld r26,-48(r1) >>> ld r27,-40(r1) >>> ld r28,-32(r1) >>> ld r29,-24(r1) >>> @@ -443,10 +436,6 @@ L(short_path_prepare_2_3): >>> mr r4,r28 >>> mr r9,r29 >>> b L(short_path_2) >>> -L(zero_pad_loop_b_prepare): >>> - addi r10,r9,8 >>> - rldicl r5,r5,0,61 >>> - b L(zero_pad_loop_b_start) >>> L(zero_pad_start_prepare_1): >>> mr r5,r6 >>> mr r9,r8 >>> >> >
Thank you for reviewing, Adhemerval. Pushed as 72c11b353ede72931cc474c9071d143d9a05c0d7. On Tue, 26 Apr 2016 11:16:48 -0300 Adhemerval Zanella <adhemerval.zanella@linaro.org> wrote: > On 26/04/2016 10:41, Gabriel F. T. Gomes wrote: > > On Wed, 20 Apr 2016 18:25:37 -0300 > > Adhemerval Zanella <adhemerval.zanella@linaro.org> wrote: > > > >> I am ok with this patch, the only nit is it now calls memset for all > >> padding size, where for small padding (less than 16 or maybe 32) I think it would > >> be still faster than calling memset (since dcbtst will be used only for > >> large inputs). However I still not sure if embedding the memset write_LT_32 > >> would yield much gain as well. > > > > Hi, Adhemerval. Thanks for your review. > > > > I also expected that for small amounts of padding, calling memset would > > harm performance. However, I ran some tests, where memset was only > > called for padding amounts greater than 32 bytes, and I didn't notice > > any significant changes in performance. That's the reason why I sent > > this version of the patch, which always calls memset. > > > > I also noticed that bench-strncpy exposes a load-hit-store problem, > > which might be suppressing the difference for small paddings. I have > > plans to fix this in the future, and, if necessary, limit memset to > > greater paddings. > > > > However, right now, I don't see a benefit in adding the check. > > > > > > Fair enough then. > > >> On 20-04-2016 17:55, Gabriel F. T. Gomes wrote: > >>> Call __memset_power8 to pad, with zeros, the remaining bytes in the > >>> dest string on __strncpy_power8 and __stpncpy_power8. This improves > >>> performance when n is larger than the input string, giving ~30% gain for > >>> larger strings without impacting much shorter strings. > >>> > >>> 2016-04-18 Gabriel F. T. Gomes <gftg@linux.vnet.ibm.com> > >>> > >>> * sysdeps/powerpc/powerpc64/power8/strncpy.S: Call memset to pad the > >>> remaining bytes in the dest string, with zeros. > >>> --- > >>> sysdeps/powerpc/powerpc64/power8/strncpy.S | 123 +++++++++++++---------------- > >>> 1 file changed, 56 insertions(+), 67 deletions(-) > >>> > >>> diff --git a/sysdeps/powerpc/powerpc64/power8/strncpy.S b/sysdeps/powerpc/powerpc64/power8/strncpy.S > >>> index 17c3afb..0bb3bd4 100644 > >>> --- a/sysdeps/powerpc/powerpc64/power8/strncpy.S > >>> +++ b/sysdeps/powerpc/powerpc64/power8/strncpy.S > >>> @@ -24,6 +24,8 @@ > >>> # define FUNC_NAME strncpy > >>> #endif > >>> > >>> +#define FRAMESIZE (FRAME_MIN_SIZE+48) > >>> + > >>> /* Implements the function > >>> > >>> char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5]) > >>> @@ -54,8 +56,7 @@ EALIGN (FUNC_NAME, 4, 0) > >>> addi r10,r4,16 > >>> rlwinm r9,r4,0,19,19 > >>> > >>> - /* Since it is a leaf function, save some non-volatile registers on the > >>> - protected/red zone. */ > >>> + /* Save some non-volatile registers on the stack. */ > >>> std r26,-48(r1) > >>> std r27,-40(r1) > >>> > >>> @@ -69,6 +70,14 @@ EALIGN (FUNC_NAME, 4, 0) > >>> std r30,-16(r1) > >>> std r31,-8(r1) > >>> > >>> + /* Update CFI. */ > >>> + cfi_offset(r26, -48) > >>> + cfi_offset(r27, -40) > >>> + cfi_offset(r28, -32) > >>> + cfi_offset(r29, -24) > >>> + cfi_offset(r30, -16) > >>> + cfi_offset(r31, -8) > >>> + > >>> beq cr7,L(unaligned_lt_16) > >>> rldicl r9,r4,0,61 > >>> subfic r8,r9,8 > >>> @@ -180,74 +189,58 @@ L(short_path_loop_end): > >>> ld r31,-8(r1) > >>> blr > >>> > >>> - /* This code pads the remainder dest with NULL bytes. The algorithm > >>> - calculate the remanining size and issues a doubleword unrolled > >>> - loops followed by a byte a byte set. */ > >>> + /* This code pads the remainder of dest with NULL bytes. The algorithm > >>> + calculates the remaining size and calls memset. */ > >>> .align 4 > >>> L(zero_pad_start): > >>> mr r5,r10 > >>> mr r9,r6 > >>> L(zero_pad_start_1): > >>> - srdi. r8,r5,r3 > >>> - mr r10,r9 > >>> -#ifdef USE_AS_STPNCPY > >>> - mr r3,r9 > >>> + /* At this point: > >>> + - r5 holds the number of bytes that still have to be written to > >>> + dest. > >>> + - r9 points to the position, in dest, where the first null byte > >>> + will be written. > >>> + The above statements are true both when control reaches this label > >>> + from a branch or when falling through the previous lines. */ > >>> +#ifndef USE_AS_STPNCPY > >>> + mr r30,r3 /* Save the return value of strncpy. */ > >>> +#endif > >>> + /* Prepare the call to memset. */ > >>> + mr r3,r9 /* Pointer to the area to be zero-filled. */ > >>> + li r4,0 /* Byte to be written (zero). */ > >>> + > >>> + /* We delayed the creation of the stack frame, as well as the saving of > >>> + the link register, because only at this point, we are sure that > >>> + doing so is actually needed. */ > >>> + > >>> + /* Save the link register. */ > >>> + mflr r0 > >>> + std r0,16(r1) > >>> + cfi_offset(lr, 16) > >>> + > >>> + /* Create the stack frame. */ > >>> + stdu r1,-FRAMESIZE(r1) > >>> + cfi_adjust_cfa_offset(FRAMESIZE) > >>> + > >>> + bl __memset_power8 > >>> + nop > >>> + > >>> + /* Restore the stack frame. */ > >>> + addi r1,r1,FRAMESIZE > >>> + cfi_adjust_cfa_offset(-FRAMESIZE) > >>> + /* Restore the link register. */ > >>> + ld r0,16(r1) > >>> + mtlr r0 > >>> + > >>> +#ifndef USE_AS_STPNCPY > >>> + mr r3,r30 /* Restore the return value of strncpy, i.e.: > >>> + dest. For stpncpy, the return value is the > >>> + same as return value of memset. */ > >>> #endif > >>> - beq- cr0,L(zero_pad_loop_b_start) > >>> - cmpldi cr7,r8,1 > >>> - li cr7,0 > >>> - std r7,0(r9) > >>> - beq cr7,L(zero_pad_loop_b_prepare) > >>> - addic. r8,r8,-2 > >>> - addi r10,r9,r16 > >>> - std r7,8(r9) > >>> - beq cr0,L(zero_pad_loop_dw_2) > >>> - std r7,16(r9) > >>> - li r9,0 > >>> - b L(zero_pad_loop_dw_1) > >>> - > >>> - .align 4 > >>> -L(zero_pad_loop_dw): > >>> - addi r10,r10,16 > >>> - std r9,-8(r10) > >>> - beq cr0,L(zero_pad_loop_dw_2) > >>> - std r9,0(r10) > >>> -L(zero_pad_loop_dw_1): > >>> - cmpldi cr7,r8,1 > >>> - std r9,0(r10) > >>> - addic. r8,r8,-2 > >>> - bne cr7,L(zero_pad_loop_dw) > >>> - addi r10,r10,8 > >>> -L(zero_pad_loop_dw_2): > >>> - rldicl r5,r5,0,61 > >>> -L(zero_pad_loop_b_start): > >>> - cmpdi cr7,r5,0 > >>> - addi r5,r5,-1 > >>> - addi r9,r10,-1 > >>> - add r10,r10,5 > >>> - subf r10,r9,r10 > >>> - li r8,0 > >>> - beq- cr7,L(short_path_loop_end) > >>> - > >>> - /* Write remaining 1-8 bytes. */ > >>> - .align 4 > >>> - addi r9,r9,1 > >>> - mtocrf 0x1,r10 > >>> - bf 29,4f > >>> - stw r8,0(r9) > >>> - addi r9,r9,4 > >>> - > >>> - .align 4 > >>> -4: bf 30,2f > >>> - sth r8,0(r9) > >>> - addi r9,r9,2 > >>> - > >>> - .align 4 > >>> -2: bf 31,1f > >>> - stb r8,0(r9) > >>> > >>> - /* Restore non-volatile registers. */ > >>> -1: ld r26,-48(r1) > >>> + /* Restore non-volatile registers and return. */ > >>> + ld r26,-48(r1) > >>> ld r27,-40(r1) > >>> ld r28,-32(r1) > >>> ld r29,-24(r1) > >>> @@ -443,10 +436,6 @@ L(short_path_prepare_2_3): > >>> mr r4,r28 > >>> mr r9,r29 > >>> b L(short_path_2) > >>> -L(zero_pad_loop_b_prepare): > >>> - addi r10,r9,8 > >>> - rldicl r5,r5,0,61 > >>> - b L(zero_pad_loop_b_start) > >>> L(zero_pad_start_prepare_1): > >>> mr r5,r6 > >>> mr r9,r8 > >>> > >> > > >
diff --git a/sysdeps/powerpc/powerpc64/power8/strncpy.S b/sysdeps/powerpc/powerpc64/power8/strncpy.S index 17c3afb..0bb3bd4 100644 --- a/sysdeps/powerpc/powerpc64/power8/strncpy.S +++ b/sysdeps/powerpc/powerpc64/power8/strncpy.S @@ -24,6 +24,8 @@ # define FUNC_NAME strncpy #endif +#define FRAMESIZE (FRAME_MIN_SIZE+48) + /* Implements the function char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5]) @@ -54,8 +56,7 @@ EALIGN (FUNC_NAME, 4, 0) addi r10,r4,16 rlwinm r9,r4,0,19,19 - /* Since it is a leaf function, save some non-volatile registers on the - protected/red zone. */ + /* Save some non-volatile registers on the stack. */ std r26,-48(r1) std r27,-40(r1) @@ -69,6 +70,14 @@ EALIGN (FUNC_NAME, 4, 0) std r30,-16(r1) std r31,-8(r1) + /* Update CFI. */ + cfi_offset(r26, -48) + cfi_offset(r27, -40) + cfi_offset(r28, -32) + cfi_offset(r29, -24) + cfi_offset(r30, -16) + cfi_offset(r31, -8) + beq cr7,L(unaligned_lt_16) rldicl r9,r4,0,61 subfic r8,r9,8 @@ -180,74 +189,58 @@ L(short_path_loop_end): ld r31,-8(r1) blr - /* This code pads the remainder dest with NULL bytes. The algorithm - calculate the remanining size and issues a doubleword unrolled - loops followed by a byte a byte set. */ + /* This code pads the remainder of dest with NULL bytes. The algorithm + calculates the remaining size and calls memset. */ .align 4 L(zero_pad_start): mr r5,r10 mr r9,r6 L(zero_pad_start_1): - srdi. r8,r5,r3 - mr r10,r9 -#ifdef USE_AS_STPNCPY - mr r3,r9 + /* At this point: + - r5 holds the number of bytes that still have to be written to + dest. + - r9 points to the position, in dest, where the first null byte + will be written. + The above statements are true both when control reaches this label + from a branch or when falling through the previous lines. */ +#ifndef USE_AS_STPNCPY + mr r30,r3 /* Save the return value of strncpy. */ +#endif + /* Prepare the call to memset. */ + mr r3,r9 /* Pointer to the area to be zero-filled. */ + li r4,0 /* Byte to be written (zero). */ + + /* We delayed the creation of the stack frame, as well as the saving of + the link register, because only at this point, we are sure that + doing so is actually needed. */ + + /* Save the link register. */ + mflr r0 + std r0,16(r1) + cfi_offset(lr, 16) + + /* Create the stack frame. */ + stdu r1,-FRAMESIZE(r1) + cfi_adjust_cfa_offset(FRAMESIZE) + + bl __memset_power8 + nop + + /* Restore the stack frame. */ + addi r1,r1,FRAMESIZE + cfi_adjust_cfa_offset(-FRAMESIZE) + /* Restore the link register. */ + ld r0,16(r1) + mtlr r0 + +#ifndef USE_AS_STPNCPY + mr r3,r30 /* Restore the return value of strncpy, i.e.: + dest. For stpncpy, the return value is the + same as return value of memset. */ #endif - beq- cr0,L(zero_pad_loop_b_start) - cmpldi cr7,r8,1 - li cr7,0 - std r7,0(r9) - beq cr7,L(zero_pad_loop_b_prepare) - addic. r8,r8,-2 - addi r10,r9,r16 - std r7,8(r9) - beq cr0,L(zero_pad_loop_dw_2) - std r7,16(r9) - li r9,0 - b L(zero_pad_loop_dw_1) - - .align 4 -L(zero_pad_loop_dw): - addi r10,r10,16 - std r9,-8(r10) - beq cr0,L(zero_pad_loop_dw_2) - std r9,0(r10) -L(zero_pad_loop_dw_1): - cmpldi cr7,r8,1 - std r9,0(r10) - addic. r8,r8,-2 - bne cr7,L(zero_pad_loop_dw) - addi r10,r10,8 -L(zero_pad_loop_dw_2): - rldicl r5,r5,0,61 -L(zero_pad_loop_b_start): - cmpdi cr7,r5,0 - addi r5,r5,-1 - addi r9,r10,-1 - add r10,r10,5 - subf r10,r9,r10 - li r8,0 - beq- cr7,L(short_path_loop_end) - - /* Write remaining 1-8 bytes. */ - .align 4 - addi r9,r9,1 - mtocrf 0x1,r10 - bf 29,4f - stw r8,0(r9) - addi r9,r9,4 - - .align 4 -4: bf 30,2f - sth r8,0(r9) - addi r9,r9,2 - - .align 4 -2: bf 31,1f - stb r8,0(r9) - /* Restore non-volatile registers. */ -1: ld r26,-48(r1) + /* Restore non-volatile registers and return. */ + ld r26,-48(r1) ld r27,-40(r1) ld r28,-32(r1) ld r29,-24(r1) @@ -443,10 +436,6 @@ L(short_path_prepare_2_3): mr r4,r28 mr r9,r29 b L(short_path_2) -L(zero_pad_loop_b_prepare): - addi r10,r9,8 - rldicl r5,r5,0,61 - b L(zero_pad_loop_b_start) L(zero_pad_start_prepare_1): mr r5,r6 mr r9,r8