diff mbox

[1/2] powerpc: Zero pad using memset in strncpy/stpncpy

Message ID 1461185716-7332-1-git-send-email-gftg@linux.vnet.ibm.com
State New
Headers show

Commit Message

Gabriel F. T. Gomes April 20, 2016, 8:55 p.m. UTC
Call __memset_power8 to pad, with zeros, the remaining bytes in the
dest string on __strncpy_power8 and __stpncpy_power8.  This improves
performance when n is larger than the input string, giving ~30% gain for
larger strings without impacting much shorter strings.

2016-04-18  Gabriel F. T. Gomes  <gftg@linux.vnet.ibm.com>

	* sysdeps/powerpc/powerpc64/power8/strncpy.S: Call memset to pad the
	remaining bytes in the dest string, with zeros.
---
 sysdeps/powerpc/powerpc64/power8/strncpy.S | 123 +++++++++++++----------------
 1 file changed, 56 insertions(+), 67 deletions(-)

Comments

Adhemerval Zanella April 20, 2016, 9:25 p.m. UTC | #1
I am ok with this patch, the only nit is it now calls memset for all
padding size, where for small padding (less than 16 or maybe 32) I think it would
be still faster than calling memset (since dcbtst will be used only for
large inputs). However I still not sure if embedding the memset write_LT_32
would yield much gain as well.

On 20-04-2016 17:55, Gabriel F. T. Gomes wrote:
> Call __memset_power8 to pad, with zeros, the remaining bytes in the
> dest string on __strncpy_power8 and __stpncpy_power8.  This improves
> performance when n is larger than the input string, giving ~30% gain for
> larger strings without impacting much shorter strings.
> 
> 2016-04-18  Gabriel F. T. Gomes  <gftg@linux.vnet.ibm.com>
> 
> 	* sysdeps/powerpc/powerpc64/power8/strncpy.S: Call memset to pad the
> 	remaining bytes in the dest string, with zeros.
> ---
>  sysdeps/powerpc/powerpc64/power8/strncpy.S | 123 +++++++++++++----------------
>  1 file changed, 56 insertions(+), 67 deletions(-)
> 
> diff --git a/sysdeps/powerpc/powerpc64/power8/strncpy.S b/sysdeps/powerpc/powerpc64/power8/strncpy.S
> index 17c3afb..0bb3bd4 100644
> --- a/sysdeps/powerpc/powerpc64/power8/strncpy.S
> +++ b/sysdeps/powerpc/powerpc64/power8/strncpy.S
> @@ -24,6 +24,8 @@
>  # define FUNC_NAME strncpy
>  #endif
>  
> +#define FRAMESIZE (FRAME_MIN_SIZE+48)
> +
>  /* Implements the function
>  
>     char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
> @@ -54,8 +56,7 @@ EALIGN (FUNC_NAME, 4, 0)
>  	addi	r10,r4,16
>  	rlwinm	r9,r4,0,19,19
>  
> -	/* Since it is a leaf function, save some non-volatile registers on the
> -	   protected/red zone.  */
> +	/* Save some non-volatile registers on the stack.  */
>  	std	r26,-48(r1)
>  	std	r27,-40(r1)
>  
> @@ -69,6 +70,14 @@ EALIGN (FUNC_NAME, 4, 0)
>  	std	r30,-16(r1)
>  	std	r31,-8(r1)
>  
> +	/* Update CFI.  */
> +	cfi_offset(r26, -48)
> +	cfi_offset(r27, -40)
> +	cfi_offset(r28, -32)
> +	cfi_offset(r29, -24)
> +	cfi_offset(r30, -16)
> +	cfi_offset(r31, -8)
> +
>  	beq	cr7,L(unaligned_lt_16)
>  	rldicl	r9,r4,0,61
>  	subfic	r8,r9,8
> @@ -180,74 +189,58 @@ L(short_path_loop_end):
>  	ld	r31,-8(r1)
>  	blr
>  
> -	/* This code pads the remainder dest with NULL bytes.  The algorithm
> -	   calculate the remanining size and issues a doubleword unrolled
> -	   loops followed by a byte a byte set.  */
> +	/* This code pads the remainder of dest with NULL bytes.  The algorithm
> +	   calculates the remaining size and calls memset.  */
>  	.align	4
>  L(zero_pad_start):
>  	mr	r5,r10
>  	mr	r9,r6
>  L(zero_pad_start_1):
> -	srdi.	r8,r5,r3
> -	mr	r10,r9
> -#ifdef USE_AS_STPNCPY
> -	mr	r3,r9
> +	/* At this point:
> +	     - r5 holds the number of bytes that still have to be written to
> +	       dest.
> +	     - r9 points to the position, in dest, where the first null byte
> +	       will be written.
> +	   The above statements are true both when control reaches this label
> +	   from a branch or when falling through the previous lines.  */
> +#ifndef USE_AS_STPNCPY
> +	mr	r30,r3       /* Save the return value of strncpy.  */
> +#endif
> +	/* Prepare the call to memset.  */
> +	mr	r3,r9        /* Pointer to the area to be zero-filled.  */
> +	li	r4,0         /* Byte to be written (zero).  */
> +
> +	/* We delayed the creation of the stack frame, as well as the saving of
> +	   the link register, because only at this point, we are sure that
> +	   doing so is actually needed.  */
> +
> +	/* Save the link register.  */
> +	mflr	r0
> +	std	r0,16(r1)
> +	cfi_offset(lr, 16)
> +
> +	/* Create the stack frame.  */
> +	stdu	r1,-FRAMESIZE(r1)
> +	cfi_adjust_cfa_offset(FRAMESIZE)
> +
> +	bl	__memset_power8
> +	nop
> +
> +	/* Restore the stack frame.  */
> +	addi	r1,r1,FRAMESIZE
> +	cfi_adjust_cfa_offset(-FRAMESIZE)
> +	/* Restore the link register.  */
> +	ld	r0,16(r1)
> +	mtlr	r0
> +
> +#ifndef USE_AS_STPNCPY
> +	mr	r3,r30       /* Restore the return value of strncpy, i.e.:
> +				dest.  For stpncpy, the return value is the
> +				same as return value of memset.  */
>  #endif
> -	beq-	cr0,L(zero_pad_loop_b_start)
> -	cmpldi	cr7,r8,1
> -	li	cr7,0
> -	std	r7,0(r9)
> -	beq	cr7,L(zero_pad_loop_b_prepare)
> -	addic.	r8,r8,-2
> -	addi	r10,r9,r16
> -	std	r7,8(r9)
> -	beq	cr0,L(zero_pad_loop_dw_2)
> -	std	r7,16(r9)
> -	li	r9,0
> -	b	L(zero_pad_loop_dw_1)
> -
> -	.align	4
> -L(zero_pad_loop_dw):
> -	addi	r10,r10,16
> -	std	r9,-8(r10)
> -	beq	cr0,L(zero_pad_loop_dw_2)
> -	std	r9,0(r10)
> -L(zero_pad_loop_dw_1):
> -	cmpldi	cr7,r8,1
> -	std	r9,0(r10)
> -	addic.	r8,r8,-2
> -	bne	cr7,L(zero_pad_loop_dw)
> -	addi	r10,r10,8
> -L(zero_pad_loop_dw_2):
> -	rldicl	r5,r5,0,61
> -L(zero_pad_loop_b_start):
> -	cmpdi	cr7,r5,0
> -	addi	r5,r5,-1
> -	addi	r9,r10,-1
> -	add	r10,r10,5
> -	subf	r10,r9,r10
> -	li	r8,0
> -	beq-	cr7,L(short_path_loop_end)
> -
> -	/* Write remaining 1-8 bytes.  */
> -        .align  4
> -	addi	r9,r9,1
> -	mtocrf	0x1,r10
> -	bf	29,4f
> -        stw     r8,0(r9)
> -        addi	r9,r9,4
> -
> -        .align  4
> -4:      bf      30,2f
> -        sth     r8,0(r9)
> -        addi	r9,r9,2
> -
> -        .align  4
> -2:      bf	31,1f
> -        stb	r8,0(r9)
>  
> -	/* Restore non-volatile registers.  */
> -1:	ld	r26,-48(r1)
> +	/* Restore non-volatile registers and return.  */
> +	ld	r26,-48(r1)
>  	ld	r27,-40(r1)
>  	ld	r28,-32(r1)
>  	ld	r29,-24(r1)
> @@ -443,10 +436,6 @@ L(short_path_prepare_2_3):
>  	mr	r4,r28
>  	mr	r9,r29
>  	b	L(short_path_2)
> -L(zero_pad_loop_b_prepare):
> -	addi	r10,r9,8
> -	rldicl	r5,r5,0,61
> -	b	L(zero_pad_loop_b_start)
>  L(zero_pad_start_prepare_1):
>  	mr	r5,r6
>  	mr	r9,r8
>
Gabriel F. T. Gomes April 26, 2016, 1:41 p.m. UTC | #2
On Wed, 20 Apr 2016 18:25:37 -0300
Adhemerval Zanella <adhemerval.zanella@linaro.org> wrote:

> I am ok with this patch, the only nit is it now calls memset for all
> padding size, where for small padding (less than 16 or maybe 32) I think it would
> be still faster than calling memset (since dcbtst will be used only for
> large inputs). However I still not sure if embedding the memset write_LT_32
> would yield much gain as well.

Hi, Adhemerval. Thanks for your review.

I also expected that for small amounts of padding, calling memset would
harm performance. However, I ran some tests, where memset was only
called for padding amounts greater than 32 bytes, and I didn't notice
any significant changes in performance. That's the reason why I sent
this version of the patch, which always calls memset.

I also noticed that bench-strncpy exposes a load-hit-store problem,
which might be suppressing the difference for small paddings. I have
plans to fix this in the future, and, if necessary, limit memset to
greater paddings.

However, right now, I don't see a benefit in adding the check.

 
> On 20-04-2016 17:55, Gabriel F. T. Gomes wrote:
> > Call __memset_power8 to pad, with zeros, the remaining bytes in the
> > dest string on __strncpy_power8 and __stpncpy_power8.  This improves
> > performance when n is larger than the input string, giving ~30% gain for
> > larger strings without impacting much shorter strings.
> > 
> > 2016-04-18  Gabriel F. T. Gomes  <gftg@linux.vnet.ibm.com>
> > 
> > 	* sysdeps/powerpc/powerpc64/power8/strncpy.S: Call memset to pad the
> > 	remaining bytes in the dest string, with zeros.
> > ---
> >  sysdeps/powerpc/powerpc64/power8/strncpy.S | 123 +++++++++++++----------------
> >  1 file changed, 56 insertions(+), 67 deletions(-)
> > 
> > diff --git a/sysdeps/powerpc/powerpc64/power8/strncpy.S b/sysdeps/powerpc/powerpc64/power8/strncpy.S
> > index 17c3afb..0bb3bd4 100644
> > --- a/sysdeps/powerpc/powerpc64/power8/strncpy.S
> > +++ b/sysdeps/powerpc/powerpc64/power8/strncpy.S
> > @@ -24,6 +24,8 @@
> >  # define FUNC_NAME strncpy
> >  #endif
> >  
> > +#define FRAMESIZE (FRAME_MIN_SIZE+48)
> > +
> >  /* Implements the function
> >  
> >     char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
> > @@ -54,8 +56,7 @@ EALIGN (FUNC_NAME, 4, 0)
> >  	addi	r10,r4,16
> >  	rlwinm	r9,r4,0,19,19
> >  
> > -	/* Since it is a leaf function, save some non-volatile registers on the
> > -	   protected/red zone.  */
> > +	/* Save some non-volatile registers on the stack.  */
> >  	std	r26,-48(r1)
> >  	std	r27,-40(r1)
> >  
> > @@ -69,6 +70,14 @@ EALIGN (FUNC_NAME, 4, 0)
> >  	std	r30,-16(r1)
> >  	std	r31,-8(r1)
> >  
> > +	/* Update CFI.  */
> > +	cfi_offset(r26, -48)
> > +	cfi_offset(r27, -40)
> > +	cfi_offset(r28, -32)
> > +	cfi_offset(r29, -24)
> > +	cfi_offset(r30, -16)
> > +	cfi_offset(r31, -8)
> > +
> >  	beq	cr7,L(unaligned_lt_16)
> >  	rldicl	r9,r4,0,61
> >  	subfic	r8,r9,8
> > @@ -180,74 +189,58 @@ L(short_path_loop_end):
> >  	ld	r31,-8(r1)
> >  	blr
> >  
> > -	/* This code pads the remainder dest with NULL bytes.  The algorithm
> > -	   calculate the remanining size and issues a doubleword unrolled
> > -	   loops followed by a byte a byte set.  */
> > +	/* This code pads the remainder of dest with NULL bytes.  The algorithm
> > +	   calculates the remaining size and calls memset.  */
> >  	.align	4
> >  L(zero_pad_start):
> >  	mr	r5,r10
> >  	mr	r9,r6
> >  L(zero_pad_start_1):
> > -	srdi.	r8,r5,r3
> > -	mr	r10,r9
> > -#ifdef USE_AS_STPNCPY
> > -	mr	r3,r9
> > +	/* At this point:
> > +	     - r5 holds the number of bytes that still have to be written to
> > +	       dest.
> > +	     - r9 points to the position, in dest, where the first null byte
> > +	       will be written.
> > +	   The above statements are true both when control reaches this label
> > +	   from a branch or when falling through the previous lines.  */
> > +#ifndef USE_AS_STPNCPY
> > +	mr	r30,r3       /* Save the return value of strncpy.  */
> > +#endif
> > +	/* Prepare the call to memset.  */
> > +	mr	r3,r9        /* Pointer to the area to be zero-filled.  */
> > +	li	r4,0         /* Byte to be written (zero).  */
> > +
> > +	/* We delayed the creation of the stack frame, as well as the saving of
> > +	   the link register, because only at this point, we are sure that
> > +	   doing so is actually needed.  */
> > +
> > +	/* Save the link register.  */
> > +	mflr	r0
> > +	std	r0,16(r1)
> > +	cfi_offset(lr, 16)
> > +
> > +	/* Create the stack frame.  */
> > +	stdu	r1,-FRAMESIZE(r1)
> > +	cfi_adjust_cfa_offset(FRAMESIZE)
> > +
> > +	bl	__memset_power8
> > +	nop
> > +
> > +	/* Restore the stack frame.  */
> > +	addi	r1,r1,FRAMESIZE
> > +	cfi_adjust_cfa_offset(-FRAMESIZE)
> > +	/* Restore the link register.  */
> > +	ld	r0,16(r1)
> > +	mtlr	r0
> > +
> > +#ifndef USE_AS_STPNCPY
> > +	mr	r3,r30       /* Restore the return value of strncpy, i.e.:
> > +				dest.  For stpncpy, the return value is the
> > +				same as return value of memset.  */
> >  #endif
> > -	beq-	cr0,L(zero_pad_loop_b_start)
> > -	cmpldi	cr7,r8,1
> > -	li	cr7,0
> > -	std	r7,0(r9)
> > -	beq	cr7,L(zero_pad_loop_b_prepare)
> > -	addic.	r8,r8,-2
> > -	addi	r10,r9,r16
> > -	std	r7,8(r9)
> > -	beq	cr0,L(zero_pad_loop_dw_2)
> > -	std	r7,16(r9)
> > -	li	r9,0
> > -	b	L(zero_pad_loop_dw_1)
> > -
> > -	.align	4
> > -L(zero_pad_loop_dw):
> > -	addi	r10,r10,16
> > -	std	r9,-8(r10)
> > -	beq	cr0,L(zero_pad_loop_dw_2)
> > -	std	r9,0(r10)
> > -L(zero_pad_loop_dw_1):
> > -	cmpldi	cr7,r8,1
> > -	std	r9,0(r10)
> > -	addic.	r8,r8,-2
> > -	bne	cr7,L(zero_pad_loop_dw)
> > -	addi	r10,r10,8
> > -L(zero_pad_loop_dw_2):
> > -	rldicl	r5,r5,0,61
> > -L(zero_pad_loop_b_start):
> > -	cmpdi	cr7,r5,0
> > -	addi	r5,r5,-1
> > -	addi	r9,r10,-1
> > -	add	r10,r10,5
> > -	subf	r10,r9,r10
> > -	li	r8,0
> > -	beq-	cr7,L(short_path_loop_end)
> > -
> > -	/* Write remaining 1-8 bytes.  */
> > -        .align  4
> > -	addi	r9,r9,1
> > -	mtocrf	0x1,r10
> > -	bf	29,4f
> > -        stw     r8,0(r9)
> > -        addi	r9,r9,4
> > -
> > -        .align  4
> > -4:      bf      30,2f
> > -        sth     r8,0(r9)
> > -        addi	r9,r9,2
> > -
> > -        .align  4
> > -2:      bf	31,1f
> > -        stb	r8,0(r9)
> >  
> > -	/* Restore non-volatile registers.  */
> > -1:	ld	r26,-48(r1)
> > +	/* Restore non-volatile registers and return.  */
> > +	ld	r26,-48(r1)
> >  	ld	r27,-40(r1)
> >  	ld	r28,-32(r1)
> >  	ld	r29,-24(r1)
> > @@ -443,10 +436,6 @@ L(short_path_prepare_2_3):
> >  	mr	r4,r28
> >  	mr	r9,r29
> >  	b	L(short_path_2)
> > -L(zero_pad_loop_b_prepare):
> > -	addi	r10,r9,8
> > -	rldicl	r5,r5,0,61
> > -	b	L(zero_pad_loop_b_start)
> >  L(zero_pad_start_prepare_1):
> >  	mr	r5,r6
> >  	mr	r9,r8
> >   
>
Adhemerval Zanella April 26, 2016, 2:16 p.m. UTC | #3
On 26/04/2016 10:41, Gabriel F. T. Gomes wrote:
> On Wed, 20 Apr 2016 18:25:37 -0300
> Adhemerval Zanella <adhemerval.zanella@linaro.org> wrote:
> 
>> I am ok with this patch, the only nit is it now calls memset for all
>> padding size, where for small padding (less than 16 or maybe 32) I think it would
>> be still faster than calling memset (since dcbtst will be used only for
>> large inputs). However I still not sure if embedding the memset write_LT_32
>> would yield much gain as well.
> 
> Hi, Adhemerval. Thanks for your review.
> 
> I also expected that for small amounts of padding, calling memset would
> harm performance. However, I ran some tests, where memset was only
> called for padding amounts greater than 32 bytes, and I didn't notice
> any significant changes in performance. That's the reason why I sent
> this version of the patch, which always calls memset.
> 
> I also noticed that bench-strncpy exposes a load-hit-store problem,
> which might be suppressing the difference for small paddings. I have
> plans to fix this in the future, and, if necessary, limit memset to
> greater paddings.
> 
> However, right now, I don't see a benefit in adding the check.
> 
>  

Fair enough then.

>> On 20-04-2016 17:55, Gabriel F. T. Gomes wrote:
>>> Call __memset_power8 to pad, with zeros, the remaining bytes in the
>>> dest string on __strncpy_power8 and __stpncpy_power8.  This improves
>>> performance when n is larger than the input string, giving ~30% gain for
>>> larger strings without impacting much shorter strings.
>>>
>>> 2016-04-18  Gabriel F. T. Gomes  <gftg@linux.vnet.ibm.com>
>>>
>>> 	* sysdeps/powerpc/powerpc64/power8/strncpy.S: Call memset to pad the
>>> 	remaining bytes in the dest string, with zeros.
>>> ---
>>>  sysdeps/powerpc/powerpc64/power8/strncpy.S | 123 +++++++++++++----------------
>>>  1 file changed, 56 insertions(+), 67 deletions(-)
>>>
>>> diff --git a/sysdeps/powerpc/powerpc64/power8/strncpy.S b/sysdeps/powerpc/powerpc64/power8/strncpy.S
>>> index 17c3afb..0bb3bd4 100644
>>> --- a/sysdeps/powerpc/powerpc64/power8/strncpy.S
>>> +++ b/sysdeps/powerpc/powerpc64/power8/strncpy.S
>>> @@ -24,6 +24,8 @@
>>>  # define FUNC_NAME strncpy
>>>  #endif
>>>  
>>> +#define FRAMESIZE (FRAME_MIN_SIZE+48)
>>> +
>>>  /* Implements the function
>>>  
>>>     char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
>>> @@ -54,8 +56,7 @@ EALIGN (FUNC_NAME, 4, 0)
>>>  	addi	r10,r4,16
>>>  	rlwinm	r9,r4,0,19,19
>>>  
>>> -	/* Since it is a leaf function, save some non-volatile registers on the
>>> -	   protected/red zone.  */
>>> +	/* Save some non-volatile registers on the stack.  */
>>>  	std	r26,-48(r1)
>>>  	std	r27,-40(r1)
>>>  
>>> @@ -69,6 +70,14 @@ EALIGN (FUNC_NAME, 4, 0)
>>>  	std	r30,-16(r1)
>>>  	std	r31,-8(r1)
>>>  
>>> +	/* Update CFI.  */
>>> +	cfi_offset(r26, -48)
>>> +	cfi_offset(r27, -40)
>>> +	cfi_offset(r28, -32)
>>> +	cfi_offset(r29, -24)
>>> +	cfi_offset(r30, -16)
>>> +	cfi_offset(r31, -8)
>>> +
>>>  	beq	cr7,L(unaligned_lt_16)
>>>  	rldicl	r9,r4,0,61
>>>  	subfic	r8,r9,8
>>> @@ -180,74 +189,58 @@ L(short_path_loop_end):
>>>  	ld	r31,-8(r1)
>>>  	blr
>>>  
>>> -	/* This code pads the remainder dest with NULL bytes.  The algorithm
>>> -	   calculate the remanining size and issues a doubleword unrolled
>>> -	   loops followed by a byte a byte set.  */
>>> +	/* This code pads the remainder of dest with NULL bytes.  The algorithm
>>> +	   calculates the remaining size and calls memset.  */
>>>  	.align	4
>>>  L(zero_pad_start):
>>>  	mr	r5,r10
>>>  	mr	r9,r6
>>>  L(zero_pad_start_1):
>>> -	srdi.	r8,r5,r3
>>> -	mr	r10,r9
>>> -#ifdef USE_AS_STPNCPY
>>> -	mr	r3,r9
>>> +	/* At this point:
>>> +	     - r5 holds the number of bytes that still have to be written to
>>> +	       dest.
>>> +	     - r9 points to the position, in dest, where the first null byte
>>> +	       will be written.
>>> +	   The above statements are true both when control reaches this label
>>> +	   from a branch or when falling through the previous lines.  */
>>> +#ifndef USE_AS_STPNCPY
>>> +	mr	r30,r3       /* Save the return value of strncpy.  */
>>> +#endif
>>> +	/* Prepare the call to memset.  */
>>> +	mr	r3,r9        /* Pointer to the area to be zero-filled.  */
>>> +	li	r4,0         /* Byte to be written (zero).  */
>>> +
>>> +	/* We delayed the creation of the stack frame, as well as the saving of
>>> +	   the link register, because only at this point, we are sure that
>>> +	   doing so is actually needed.  */
>>> +
>>> +	/* Save the link register.  */
>>> +	mflr	r0
>>> +	std	r0,16(r1)
>>> +	cfi_offset(lr, 16)
>>> +
>>> +	/* Create the stack frame.  */
>>> +	stdu	r1,-FRAMESIZE(r1)
>>> +	cfi_adjust_cfa_offset(FRAMESIZE)
>>> +
>>> +	bl	__memset_power8
>>> +	nop
>>> +
>>> +	/* Restore the stack frame.  */
>>> +	addi	r1,r1,FRAMESIZE
>>> +	cfi_adjust_cfa_offset(-FRAMESIZE)
>>> +	/* Restore the link register.  */
>>> +	ld	r0,16(r1)
>>> +	mtlr	r0
>>> +
>>> +#ifndef USE_AS_STPNCPY
>>> +	mr	r3,r30       /* Restore the return value of strncpy, i.e.:
>>> +				dest.  For stpncpy, the return value is the
>>> +				same as return value of memset.  */
>>>  #endif
>>> -	beq-	cr0,L(zero_pad_loop_b_start)
>>> -	cmpldi	cr7,r8,1
>>> -	li	cr7,0
>>> -	std	r7,0(r9)
>>> -	beq	cr7,L(zero_pad_loop_b_prepare)
>>> -	addic.	r8,r8,-2
>>> -	addi	r10,r9,r16
>>> -	std	r7,8(r9)
>>> -	beq	cr0,L(zero_pad_loop_dw_2)
>>> -	std	r7,16(r9)
>>> -	li	r9,0
>>> -	b	L(zero_pad_loop_dw_1)
>>> -
>>> -	.align	4
>>> -L(zero_pad_loop_dw):
>>> -	addi	r10,r10,16
>>> -	std	r9,-8(r10)
>>> -	beq	cr0,L(zero_pad_loop_dw_2)
>>> -	std	r9,0(r10)
>>> -L(zero_pad_loop_dw_1):
>>> -	cmpldi	cr7,r8,1
>>> -	std	r9,0(r10)
>>> -	addic.	r8,r8,-2
>>> -	bne	cr7,L(zero_pad_loop_dw)
>>> -	addi	r10,r10,8
>>> -L(zero_pad_loop_dw_2):
>>> -	rldicl	r5,r5,0,61
>>> -L(zero_pad_loop_b_start):
>>> -	cmpdi	cr7,r5,0
>>> -	addi	r5,r5,-1
>>> -	addi	r9,r10,-1
>>> -	add	r10,r10,5
>>> -	subf	r10,r9,r10
>>> -	li	r8,0
>>> -	beq-	cr7,L(short_path_loop_end)
>>> -
>>> -	/* Write remaining 1-8 bytes.  */
>>> -        .align  4
>>> -	addi	r9,r9,1
>>> -	mtocrf	0x1,r10
>>> -	bf	29,4f
>>> -        stw     r8,0(r9)
>>> -        addi	r9,r9,4
>>> -
>>> -        .align  4
>>> -4:      bf      30,2f
>>> -        sth     r8,0(r9)
>>> -        addi	r9,r9,2
>>> -
>>> -        .align  4
>>> -2:      bf	31,1f
>>> -        stb	r8,0(r9)
>>>  
>>> -	/* Restore non-volatile registers.  */
>>> -1:	ld	r26,-48(r1)
>>> +	/* Restore non-volatile registers and return.  */
>>> +	ld	r26,-48(r1)
>>>  	ld	r27,-40(r1)
>>>  	ld	r28,-32(r1)
>>>  	ld	r29,-24(r1)
>>> @@ -443,10 +436,6 @@ L(short_path_prepare_2_3):
>>>  	mr	r4,r28
>>>  	mr	r9,r29
>>>  	b	L(short_path_2)
>>> -L(zero_pad_loop_b_prepare):
>>> -	addi	r10,r9,8
>>> -	rldicl	r5,r5,0,61
>>> -	b	L(zero_pad_loop_b_start)
>>>  L(zero_pad_start_prepare_1):
>>>  	mr	r5,r6
>>>  	mr	r9,r8
>>>   
>>
>
Gabriel F. T. Gomes April 29, 2016, 1:47 p.m. UTC | #4
Thank you for reviewing, Adhemerval.

Pushed as 72c11b353ede72931cc474c9071d143d9a05c0d7.

On Tue, 26 Apr 2016 11:16:48 -0300
Adhemerval Zanella <adhemerval.zanella@linaro.org> wrote:

> On 26/04/2016 10:41, Gabriel F. T. Gomes wrote:
> > On Wed, 20 Apr 2016 18:25:37 -0300
> > Adhemerval Zanella <adhemerval.zanella@linaro.org> wrote:
> >   
> >> I am ok with this patch, the only nit is it now calls memset for all
> >> padding size, where for small padding (less than 16 or maybe 32) I think it would
> >> be still faster than calling memset (since dcbtst will be used only for
> >> large inputs). However I still not sure if embedding the memset write_LT_32
> >> would yield much gain as well.  
> > 
> > Hi, Adhemerval. Thanks for your review.
> > 
> > I also expected that for small amounts of padding, calling memset would
> > harm performance. However, I ran some tests, where memset was only
> > called for padding amounts greater than 32 bytes, and I didn't notice
> > any significant changes in performance. That's the reason why I sent
> > this version of the patch, which always calls memset.
> > 
> > I also noticed that bench-strncpy exposes a load-hit-store problem,
> > which might be suppressing the difference for small paddings. I have
> > plans to fix this in the future, and, if necessary, limit memset to
> > greater paddings.
> > 
> > However, right now, I don't see a benefit in adding the check.
> > 
> >    
> 
> Fair enough then.
> 
> >> On 20-04-2016 17:55, Gabriel F. T. Gomes wrote:  
> >>> Call __memset_power8 to pad, with zeros, the remaining bytes in the
> >>> dest string on __strncpy_power8 and __stpncpy_power8.  This improves
> >>> performance when n is larger than the input string, giving ~30% gain for
> >>> larger strings without impacting much shorter strings.
> >>>
> >>> 2016-04-18  Gabriel F. T. Gomes  <gftg@linux.vnet.ibm.com>
> >>>
> >>> 	* sysdeps/powerpc/powerpc64/power8/strncpy.S: Call memset to pad the
> >>> 	remaining bytes in the dest string, with zeros.
> >>> ---
> >>>  sysdeps/powerpc/powerpc64/power8/strncpy.S | 123 +++++++++++++----------------
> >>>  1 file changed, 56 insertions(+), 67 deletions(-)
> >>>
> >>> diff --git a/sysdeps/powerpc/powerpc64/power8/strncpy.S b/sysdeps/powerpc/powerpc64/power8/strncpy.S
> >>> index 17c3afb..0bb3bd4 100644
> >>> --- a/sysdeps/powerpc/powerpc64/power8/strncpy.S
> >>> +++ b/sysdeps/powerpc/powerpc64/power8/strncpy.S
> >>> @@ -24,6 +24,8 @@
> >>>  # define FUNC_NAME strncpy
> >>>  #endif
> >>>  
> >>> +#define FRAMESIZE (FRAME_MIN_SIZE+48)
> >>> +
> >>>  /* Implements the function
> >>>  
> >>>     char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
> >>> @@ -54,8 +56,7 @@ EALIGN (FUNC_NAME, 4, 0)
> >>>  	addi	r10,r4,16
> >>>  	rlwinm	r9,r4,0,19,19
> >>>  
> >>> -	/* Since it is a leaf function, save some non-volatile registers on the
> >>> -	   protected/red zone.  */
> >>> +	/* Save some non-volatile registers on the stack.  */
> >>>  	std	r26,-48(r1)
> >>>  	std	r27,-40(r1)
> >>>  
> >>> @@ -69,6 +70,14 @@ EALIGN (FUNC_NAME, 4, 0)
> >>>  	std	r30,-16(r1)
> >>>  	std	r31,-8(r1)
> >>>  
> >>> +	/* Update CFI.  */
> >>> +	cfi_offset(r26, -48)
> >>> +	cfi_offset(r27, -40)
> >>> +	cfi_offset(r28, -32)
> >>> +	cfi_offset(r29, -24)
> >>> +	cfi_offset(r30, -16)
> >>> +	cfi_offset(r31, -8)
> >>> +
> >>>  	beq	cr7,L(unaligned_lt_16)
> >>>  	rldicl	r9,r4,0,61
> >>>  	subfic	r8,r9,8
> >>> @@ -180,74 +189,58 @@ L(short_path_loop_end):
> >>>  	ld	r31,-8(r1)
> >>>  	blr
> >>>  
> >>> -	/* This code pads the remainder dest with NULL bytes.  The algorithm
> >>> -	   calculate the remanining size and issues a doubleword unrolled
> >>> -	   loops followed by a byte a byte set.  */
> >>> +	/* This code pads the remainder of dest with NULL bytes.  The algorithm
> >>> +	   calculates the remaining size and calls memset.  */
> >>>  	.align	4
> >>>  L(zero_pad_start):
> >>>  	mr	r5,r10
> >>>  	mr	r9,r6
> >>>  L(zero_pad_start_1):
> >>> -	srdi.	r8,r5,r3
> >>> -	mr	r10,r9
> >>> -#ifdef USE_AS_STPNCPY
> >>> -	mr	r3,r9
> >>> +	/* At this point:
> >>> +	     - r5 holds the number of bytes that still have to be written to
> >>> +	       dest.
> >>> +	     - r9 points to the position, in dest, where the first null byte
> >>> +	       will be written.
> >>> +	   The above statements are true both when control reaches this label
> >>> +	   from a branch or when falling through the previous lines.  */
> >>> +#ifndef USE_AS_STPNCPY
> >>> +	mr	r30,r3       /* Save the return value of strncpy.  */
> >>> +#endif
> >>> +	/* Prepare the call to memset.  */
> >>> +	mr	r3,r9        /* Pointer to the area to be zero-filled.  */
> >>> +	li	r4,0         /* Byte to be written (zero).  */
> >>> +
> >>> +	/* We delayed the creation of the stack frame, as well as the saving of
> >>> +	   the link register, because only at this point, we are sure that
> >>> +	   doing so is actually needed.  */
> >>> +
> >>> +	/* Save the link register.  */
> >>> +	mflr	r0
> >>> +	std	r0,16(r1)
> >>> +	cfi_offset(lr, 16)
> >>> +
> >>> +	/* Create the stack frame.  */
> >>> +	stdu	r1,-FRAMESIZE(r1)
> >>> +	cfi_adjust_cfa_offset(FRAMESIZE)
> >>> +
> >>> +	bl	__memset_power8
> >>> +	nop
> >>> +
> >>> +	/* Restore the stack frame.  */
> >>> +	addi	r1,r1,FRAMESIZE
> >>> +	cfi_adjust_cfa_offset(-FRAMESIZE)
> >>> +	/* Restore the link register.  */
> >>> +	ld	r0,16(r1)
> >>> +	mtlr	r0
> >>> +
> >>> +#ifndef USE_AS_STPNCPY
> >>> +	mr	r3,r30       /* Restore the return value of strncpy, i.e.:
> >>> +				dest.  For stpncpy, the return value is the
> >>> +				same as return value of memset.  */
> >>>  #endif
> >>> -	beq-	cr0,L(zero_pad_loop_b_start)
> >>> -	cmpldi	cr7,r8,1
> >>> -	li	cr7,0
> >>> -	std	r7,0(r9)
> >>> -	beq	cr7,L(zero_pad_loop_b_prepare)
> >>> -	addic.	r8,r8,-2
> >>> -	addi	r10,r9,r16
> >>> -	std	r7,8(r9)
> >>> -	beq	cr0,L(zero_pad_loop_dw_2)
> >>> -	std	r7,16(r9)
> >>> -	li	r9,0
> >>> -	b	L(zero_pad_loop_dw_1)
> >>> -
> >>> -	.align	4
> >>> -L(zero_pad_loop_dw):
> >>> -	addi	r10,r10,16
> >>> -	std	r9,-8(r10)
> >>> -	beq	cr0,L(zero_pad_loop_dw_2)
> >>> -	std	r9,0(r10)
> >>> -L(zero_pad_loop_dw_1):
> >>> -	cmpldi	cr7,r8,1
> >>> -	std	r9,0(r10)
> >>> -	addic.	r8,r8,-2
> >>> -	bne	cr7,L(zero_pad_loop_dw)
> >>> -	addi	r10,r10,8
> >>> -L(zero_pad_loop_dw_2):
> >>> -	rldicl	r5,r5,0,61
> >>> -L(zero_pad_loop_b_start):
> >>> -	cmpdi	cr7,r5,0
> >>> -	addi	r5,r5,-1
> >>> -	addi	r9,r10,-1
> >>> -	add	r10,r10,5
> >>> -	subf	r10,r9,r10
> >>> -	li	r8,0
> >>> -	beq-	cr7,L(short_path_loop_end)
> >>> -
> >>> -	/* Write remaining 1-8 bytes.  */
> >>> -        .align  4
> >>> -	addi	r9,r9,1
> >>> -	mtocrf	0x1,r10
> >>> -	bf	29,4f
> >>> -        stw     r8,0(r9)
> >>> -        addi	r9,r9,4
> >>> -
> >>> -        .align  4
> >>> -4:      bf      30,2f
> >>> -        sth     r8,0(r9)
> >>> -        addi	r9,r9,2
> >>> -
> >>> -        .align  4
> >>> -2:      bf	31,1f
> >>> -        stb	r8,0(r9)
> >>>  
> >>> -	/* Restore non-volatile registers.  */
> >>> -1:	ld	r26,-48(r1)
> >>> +	/* Restore non-volatile registers and return.  */
> >>> +	ld	r26,-48(r1)
> >>>  	ld	r27,-40(r1)
> >>>  	ld	r28,-32(r1)
> >>>  	ld	r29,-24(r1)
> >>> @@ -443,10 +436,6 @@ L(short_path_prepare_2_3):
> >>>  	mr	r4,r28
> >>>  	mr	r9,r29
> >>>  	b	L(short_path_2)
> >>> -L(zero_pad_loop_b_prepare):
> >>> -	addi	r10,r9,8
> >>> -	rldicl	r5,r5,0,61
> >>> -	b	L(zero_pad_loop_b_start)
> >>>  L(zero_pad_start_prepare_1):
> >>>  	mr	r5,r6
> >>>  	mr	r9,r8
> >>>     
> >>  
> >   
>
diff mbox

Patch

diff --git a/sysdeps/powerpc/powerpc64/power8/strncpy.S b/sysdeps/powerpc/powerpc64/power8/strncpy.S
index 17c3afb..0bb3bd4 100644
--- a/sysdeps/powerpc/powerpc64/power8/strncpy.S
+++ b/sysdeps/powerpc/powerpc64/power8/strncpy.S
@@ -24,6 +24,8 @@ 
 # define FUNC_NAME strncpy
 #endif
 
+#define FRAMESIZE (FRAME_MIN_SIZE+48)
+
 /* Implements the function
 
    char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
@@ -54,8 +56,7 @@  EALIGN (FUNC_NAME, 4, 0)
 	addi	r10,r4,16
 	rlwinm	r9,r4,0,19,19
 
-	/* Since it is a leaf function, save some non-volatile registers on the
-	   protected/red zone.  */
+	/* Save some non-volatile registers on the stack.  */
 	std	r26,-48(r1)
 	std	r27,-40(r1)
 
@@ -69,6 +70,14 @@  EALIGN (FUNC_NAME, 4, 0)
 	std	r30,-16(r1)
 	std	r31,-8(r1)
 
+	/* Update CFI.  */
+	cfi_offset(r26, -48)
+	cfi_offset(r27, -40)
+	cfi_offset(r28, -32)
+	cfi_offset(r29, -24)
+	cfi_offset(r30, -16)
+	cfi_offset(r31, -8)
+
 	beq	cr7,L(unaligned_lt_16)
 	rldicl	r9,r4,0,61
 	subfic	r8,r9,8
@@ -180,74 +189,58 @@  L(short_path_loop_end):
 	ld	r31,-8(r1)
 	blr
 
-	/* This code pads the remainder dest with NULL bytes.  The algorithm
-	   calculate the remanining size and issues a doubleword unrolled
-	   loops followed by a byte a byte set.  */
+	/* This code pads the remainder of dest with NULL bytes.  The algorithm
+	   calculates the remaining size and calls memset.  */
 	.align	4
 L(zero_pad_start):
 	mr	r5,r10
 	mr	r9,r6
 L(zero_pad_start_1):
-	srdi.	r8,r5,r3
-	mr	r10,r9
-#ifdef USE_AS_STPNCPY
-	mr	r3,r9
+	/* At this point:
+	     - r5 holds the number of bytes that still have to be written to
+	       dest.
+	     - r9 points to the position, in dest, where the first null byte
+	       will be written.
+	   The above statements are true both when control reaches this label
+	   from a branch or when falling through the previous lines.  */
+#ifndef USE_AS_STPNCPY
+	mr	r30,r3       /* Save the return value of strncpy.  */
+#endif
+	/* Prepare the call to memset.  */
+	mr	r3,r9        /* Pointer to the area to be zero-filled.  */
+	li	r4,0         /* Byte to be written (zero).  */
+
+	/* We delayed the creation of the stack frame, as well as the saving of
+	   the link register, because only at this point, we are sure that
+	   doing so is actually needed.  */
+
+	/* Save the link register.  */
+	mflr	r0
+	std	r0,16(r1)
+	cfi_offset(lr, 16)
+
+	/* Create the stack frame.  */
+	stdu	r1,-FRAMESIZE(r1)
+	cfi_adjust_cfa_offset(FRAMESIZE)
+
+	bl	__memset_power8
+	nop
+
+	/* Restore the stack frame.  */
+	addi	r1,r1,FRAMESIZE
+	cfi_adjust_cfa_offset(-FRAMESIZE)
+	/* Restore the link register.  */
+	ld	r0,16(r1)
+	mtlr	r0
+
+#ifndef USE_AS_STPNCPY
+	mr	r3,r30       /* Restore the return value of strncpy, i.e.:
+				dest.  For stpncpy, the return value is the
+				same as return value of memset.  */
 #endif
-	beq-	cr0,L(zero_pad_loop_b_start)
-	cmpldi	cr7,r8,1
-	li	cr7,0
-	std	r7,0(r9)
-	beq	cr7,L(zero_pad_loop_b_prepare)
-	addic.	r8,r8,-2
-	addi	r10,r9,r16
-	std	r7,8(r9)
-	beq	cr0,L(zero_pad_loop_dw_2)
-	std	r7,16(r9)
-	li	r9,0
-	b	L(zero_pad_loop_dw_1)
-
-	.align	4
-L(zero_pad_loop_dw):
-	addi	r10,r10,16
-	std	r9,-8(r10)
-	beq	cr0,L(zero_pad_loop_dw_2)
-	std	r9,0(r10)
-L(zero_pad_loop_dw_1):
-	cmpldi	cr7,r8,1
-	std	r9,0(r10)
-	addic.	r8,r8,-2
-	bne	cr7,L(zero_pad_loop_dw)
-	addi	r10,r10,8
-L(zero_pad_loop_dw_2):
-	rldicl	r5,r5,0,61
-L(zero_pad_loop_b_start):
-	cmpdi	cr7,r5,0
-	addi	r5,r5,-1
-	addi	r9,r10,-1
-	add	r10,r10,5
-	subf	r10,r9,r10
-	li	r8,0
-	beq-	cr7,L(short_path_loop_end)
-
-	/* Write remaining 1-8 bytes.  */
-        .align  4
-	addi	r9,r9,1
-	mtocrf	0x1,r10
-	bf	29,4f
-        stw     r8,0(r9)
-        addi	r9,r9,4
-
-        .align  4
-4:      bf      30,2f
-        sth     r8,0(r9)
-        addi	r9,r9,2
-
-        .align  4
-2:      bf	31,1f
-        stb	r8,0(r9)
 
-	/* Restore non-volatile registers.  */
-1:	ld	r26,-48(r1)
+	/* Restore non-volatile registers and return.  */
+	ld	r26,-48(r1)
 	ld	r27,-40(r1)
 	ld	r28,-32(r1)
 	ld	r29,-24(r1)
@@ -443,10 +436,6 @@  L(short_path_prepare_2_3):
 	mr	r4,r28
 	mr	r9,r29
 	b	L(short_path_2)
-L(zero_pad_loop_b_prepare):
-	addi	r10,r9,8
-	rldicl	r5,r5,0,61
-	b	L(zero_pad_loop_b_start)
 L(zero_pad_start_prepare_1):
 	mr	r5,r6
 	mr	r9,r8