diff mbox series

[v2,2/2] powerpc: Add optimized stpncpy for POWER9

Message ID 20200904165653.16202-2-rzinsly@linux.ibm.com
State New
Headers show
Series [v2,1/2] powerpc: Add optimized strncpy for POWER9 | expand

Commit Message

Raphael M Zinsly Sept. 4, 2020, 4:56 p.m. UTC
Add stpncpy support into the POWER9 strncpy.
---
 sysdeps/powerpc/powerpc64/le/power9/stpncpy.S | 24 +++++++
 sysdeps/powerpc/powerpc64/le/power9/strncpy.S | 65 +++++++++++++++++++
 sysdeps/powerpc/powerpc64/multiarch/Makefile  |  2 +-
 .../powerpc64/multiarch/ifunc-impl-list.c     |  5 ++
 .../powerpc64/multiarch/stpncpy-power9.S      | 24 +++++++
 sysdeps/powerpc/powerpc64/multiarch/stpncpy.c |  7 ++
 6 files changed, 126 insertions(+), 1 deletion(-)
 create mode 100644 sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S

Comments

Raphael M Zinsly Sept. 4, 2020, 4:59 p.m. UTC | #1
Benchtest output:
                             	generic_stpncpy	__stpncpy_power9	__stpncpy_power8	__stpncpy_power7	__stpncpy_ppc
Length   16, n   16, alignment  1/ 1:	6.55566	2.5481	2.74063	5.28665	9.96288
Length   16, n   16, alignment  1/ 1:	6.70016	2.54137	2.7108	4.77502	9.91703
Length   16, n   16, alignment  1/ 2:	6.55975	2.56295	2.70641	5.49298	9.59591
Length   16, n   16, alignment  2/ 1:	6.90759	2.52713	2.854	5.48949	9.37664
Length    2, n    4, alignment  7/ 2:	7.90969	2.22698	3.90151	4.6461	8.4503
Length    4, n    2, alignment  2/ 7:	6.14855	1.73403	2.67338	3.05675	6.86316
Length    2, n    4, alignment  7/ 2:	8.40868	2.22338	4.50838	4.51078	9.28489
Length    4, n    2, alignment  2/ 7:	6.14849	1.73402	2.67225	2.85349	6.34342
Length   16, n   16, alignment  2/ 2:	6.963	2.54442	2.87779	5.63547	9.85162
Length   16, n   16, alignment  2/ 2:	6.59452	2.54121	2.84662	5.57178	9.51406
Length   16, n   16, alignment  2/ 4:	6.79115	2.55835	2.84836	5.50427	9.67999
Length   16, n   16, alignment  4/ 2:	6.78419	2.54132	3.54229	5.52563	8.50938
Length    4, n    8, alignment  6/ 4:	8.45703	2.17266	4.80507	3.8714	9.04725
Length    8, n    4, alignment  4/ 6:	6.01753	1.73761	2.8185	2.41527	8.00051
Length    4, n    8, alignment  6/ 4:	7.82081	2.22612	4.80057	3.76103	8.99812
Length    8, n    4, alignment  4/ 6:	6.01752	1.73474	2.82089	2.41524	7.82703
Length   16, n   16, alignment  3/ 3:	6.78194	2.54143	3.21392	5.46447	8.90749
Length   16, n   16, alignment  3/ 3:	6.76324	2.54088	3.22883	5.39689	9.14749
Length   16, n   16, alignment  3/ 6:	7.05278	2.55795	3.22243	5.53422	9.11315
Length   16, n   16, alignment  6/ 3:	6.72881	2.54183	4.58459	5.51658	7.85006
Length    8, n   16, alignment  5/ 6:	7.67184	2.23969	4.13269	4.90728	10.2248
Length   16, n    8, alignment  6/ 5:	5.73672	1.88048	2.6693	4.35579	6.11674
Length    8, n   16, alignment  5/ 6:	7.51707	2.2284	3.67276	4.90637	10.2411
Length   16, n    8, alignment  6/ 5:	5.73665	1.88119	2.57514	3.96351	6.16253
Length   16, n   16, alignment  4/ 4:	7.03577	2.5415	3.66445	4.94157	8.98371
Length   16, n   16, alignment  4/ 4:	6.93549	2.53033	3.65577	5.53815	8.48335
Length   16, n   16, alignment  4/ 0:	6.95106	2.53483	3.48744	5.43759	8.45425
Length   16, n   16, alignment  0/ 4:	6.44601	1.87936	2.41984	5.49488	6.92169
Length   16, n   32, alignment  4/ 0:	9.2036	3.04122	5.78685	6.66434	10.9065
Length   32, n   16, alignment  0/ 4:	6.65504	1.87934	2.41817	6.08706	6.98513
Length   16, n   32, alignment  4/ 0:	9.17461	3.04153	5.77758	6.66444	10.8015
Length   32, n   16, alignment  0/ 4:	6.44123	1.87936	2.41847	5.55207	6.86039
Length   16, n   16, alignment  5/ 5:	6.56005	2.53132	4.22362	5.43527	9.25109
Length   16, n   16, alignment  5/ 5:	6.55552	2.53088	4.22655	5.59271	9.61369
Length   16, n   16, alignment  5/ 2:	6.55553	2.54559	4.31135	5.47438	8.83103
Length   16, n   16, alignment  2/ 5:	6.88992	2.56255	2.84059	5.23185	9.51441
Length   32, n   64, alignment  3/ 2:	12.5054	3.75138	6.42457	10.4719	15.0663
Length   64, n   32, alignment  2/ 3:	9.87185	2.78283	3.17042	7.66624	11.503
Length   32, n   64, alignment  3/ 2:	12.4999	3.74537	6.38161	10.4578	15.1104
Length   64, n   32, alignment  2/ 3:	9.86495	2.77889	3.19171	7.63272	13.9799
Length   16, n   16, alignment  6/ 6:	6.41353	2.5453	4.50915	5.30382	8.45391
Length   16, n   16, alignment  6/ 6:	6.49495	2.54119	4.54493	5.55909	8.1629
Length   16, n   16, alignment  6/ 4:	6.41743	2.54487	4.57202	4.98659	7.53033
Length   16, n   16, alignment  4/ 6:	6.91724	2.54649	3.67868	5.36838	8.45677
Length   64, n  128, alignment  2/ 4:	14.0687	4.93151	8.11667	11.4411	16.9533
Length  128, n   64, alignment  4/ 2:	11.7134	3.58948	4.90121	10.3018	11.6692
Length   64, n  128, alignment  2/ 4:	14.0677	4.93413	7.28129	11.439	22.2186
Length  128, n   64, alignment  4/ 2:	11.7149	3.59312	4.85286	10.3403	19.4651
Length   16, n   16, alignment  7/ 7:	6.76501	2.52563	5.55792	5.44155	8.39997
Length   16, n   16, alignment  7/ 7:	7.16923	2.5265	5.55148	5.60184	7.98311
Length   16, n   16, alignment  7/ 6:	6.76252	2.52629	5.48067	5.51161	7.61026
Length   16, n   16, alignment  6/ 7:	6.65772	2.5521	4.55758	5.48893	7.7301
Length  128, n  256, alignment  1/ 6:	16.2494	7.62034	9.3616	16.2888	19.7029
Length  256, n  128, alignment  6/ 1:	13.4311	4.94455	8.10802	12.2681	15.6941
Length  128, n  256, alignment  1/ 6:	16.2608	7.6209	9.35509	16.2856	38.0277
Length  256, n  128, alignment  6/ 1:	13.4327	4.89474	8.35934	12.2646	34.3268
Length    8, n   16, alignment  0/ 0:	7.20671	2.23256	3.75778	5.63555	7.36414
Length   32, n   16, alignment  0/ 0:	6.4449	1.88	2.41577	2.89598	6.42537
Length    8, n   16, alignment  7/ 2:	7.45976	2.21832	3.91671	4.6524	8.45825
Length   32, n   16, alignment  7/ 2:	6.78267	2.34296	5.59161	5.58598	6.88842
Length   16, n   32, alignment  0/ 0:	9.47971	3.10847	4.74758	4.75377	10.2238
Length   64, n   32, alignment  0/ 0:	8.45634	2.34747	2.59248	2.82356	9.42305
Length   16, n   32, alignment  6/ 4:	9.37784	3.05067	6.92384	9.47727	10.1826
Length   64, n   32, alignment  6/ 4:	9.89233	2.77968	4.63672	7.09838	10.2804
Length   32, n   64, alignment  0/ 0:	11.0813	3.71086	4.43777	5.3549	12.2048
Length  128, n   64, alignment  0/ 0:	9.25192	3.20123	3.53388	4.50794	10.1934
Length   32, n   64, alignment  5/ 6:	12.5099	3.75871	7.29613	9.64902	13.5821
Length  128, n   64, alignment  5/ 6:	11.6115	3.60165	5.71818	9.07288	12.7929
Length   64, n  128, alignment  0/ 0:	12.3671	4.80754	5.46926	6.84492	14.9238
Length  256, n  128, alignment  0/ 0:	8.08427	4.52607	6.47996	5.92086	11.701
Length   64, n  128, alignment  4/ 0:	12.5692	4.89717	7.11058	10.472	15.875
Length  256, n  128, alignment  4/ 0:	12.2945	4.94163	7.11645	12.3831	16.6219
Length  128, n  256, alignment  0/ 0:	13.8948	7.28911	7.78784	9.30215	17.0358
Length  512, n  256, alignment  0/ 0:	10.5266	6.56481	9.14202	9.31096	20.0531
Length  128, n  256, alignment  3/ 2:	16.3534	7.46332	9.90009	18.5282	19.5969
Length  512, n  256, alignment  3/ 2:	17.0519	7.09947	10.1635	23.5411	25.0043
Length  256, n  512, alignment  0/ 0:	15.8935	12.6195	14.0756	14.7553	28.5299
Length 1024, n  512, alignment  0/ 0:	16.3758	10.8028	16.5447	16.8966	37.8653
Length  256, n  512, alignment  2/ 4:	21.16	13.2779	14.3088	26.4475	30.1647
Length 1024, n  512, alignment  2/ 4:	25.3364	12.0899	17.5443	42.7216	47.5803
Length  512, n 1024, alignment  0/ 0:	20.5111	22.9782	19.6648	21.3857	42.4801
Length 2048, n 1024, alignment  0/ 0:	28.4023	19.1577	36.9065	35.4799	68.3555
Length  512, n 1024, alignment  1/ 6:	29.9694	24.3087	22.0513	46.7436	51.5908
Length 2048, n 1024, alignment  1/ 6:	42.9897	21.5402	38.739	78.3266	84.3956
Matheus Castanho Sept. 16, 2020, 12:32 p.m. UTC | #2
On 9/4/20 1:59 PM, Raphael M Zinsly via Libc-alpha wrote:
> Benchtest output:
>                                 generic_stpncpy    __stpncpy_power9  __stpncpy_power8    __stpncpy_power7    __stpncpy_ppc
<snip>
> Length  512, n 1024, alignment  0/ 0:    20.5111    22.9782   19.6648    21.3857 42.4801
<snip>
> Length  512, n 1024, alignment  1/ 6:    29.9694    24.3087   22.0513    46.7436 51.5908

These two seem to be the only cases in which the power9 version loses to
the power8 one. Have you investigated what happens in these two specific
cases?
Matheus Castanho Sept. 16, 2020, 12:35 p.m. UTC | #3
On 9/4/20 1:56 PM, Raphael Moreira Zinsly via Libc-alpha wrote:
> Add stpncpy support into the POWER9 strncpy.
> ---
>  sysdeps/powerpc/powerpc64/le/power9/stpncpy.S | 24 +++++++
>  sysdeps/powerpc/powerpc64/le/power9/strncpy.S | 65 +++++++++++++++++++
>  sysdeps/powerpc/powerpc64/multiarch/Makefile  |  2 +-
>  .../powerpc64/multiarch/ifunc-impl-list.c     |  5 ++
>  .../powerpc64/multiarch/stpncpy-power9.S      | 24 +++++++
>  sysdeps/powerpc/powerpc64/multiarch/stpncpy.c |  7 ++
>  6 files changed, 126 insertions(+), 1 deletion(-)
>  create mode 100644 sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
>  create mode 100644 sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S
> 
> diff --git a/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S b/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
> new file mode 100644
> index 0000000000..81d9673d8b
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
> @@ -0,0 +1,24 @@
> +/* Optimized stpncpy implementation for POWER9 LE.
> +   Copyright (C) 2020 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#define USE_AS_STPNCPY
> +#include <sysdeps/powerpc/powerpc64/le/power9/strncpy.S>
> +
> +weak_alias (__stpncpy, stpncpy)
> +libc_hidden_def (__stpncpy)
> +libc_hidden_builtin_def (stpncpy)
> diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> index 34fcdee913..f7265b11ec 100644
> --- a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> +++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> @@ -18,16 +18,30 @@
>  
>  #include <sysdep.h>
>  
> +#ifdef USE_AS_STPNCPY
> +# ifndef STPNCPY
> +#   define FUNC_NAME __stpncpy
> +# else
> +#   define FUNC_NAME STPNCPY
> +# endif
> +#else
>  # ifndef STRNCPY
>  #  define FUNC_NAME strncpy
>  # else
>  #  define FUNC_NAME STRNCPY
>  # endif
> +#endif  /* !USE_AS_STPNCPY  */
>  
>  /* Implements the function
>  
>     char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
>  
> +   or
> +
> +   char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5])
> +
> +   if USE_AS_STPNCPY is defined.
> +
>     The implementation can load bytes past a null terminator, but only
>     up to the next 16-byte aligned address, so it never crosses a page.  */
>  
> @@ -49,7 +63,15 @@ ENTRY_TOCLESS (FUNC_NAME, 4)
>  
>  	/* Empty/1-byte string optimization  */
>  	cmpdi	r5,0
> +#ifdef USE_AS_STPNCPY
> +	bgt	L(cont)
> +	/* Compute pointer to last byte copied into dest.  */
> +	addi	r3,r3,1
> +	blr
> +L(cont):
> +#else
>  	beqlr
> +#endif
>  
>  	addi	r4,r4,1
>  	neg	r7,r4
> @@ -79,12 +101,20 @@ ENTRY_TOCLESS (FUNC_NAME, 4)
>  	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
>  	stxvl	32+v0,r11,r10	/* Partial store  */
>  
> +#ifdef USE_AS_STPNCPY
> +	/* Compute pointer to last byte copied into dest.  */
> +	add	r3,r11,r5
> +#endif
>  	blr
>  
>  L(null):
>  	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
>  	stxvl	32+v0,r11,r10	/* Partial store  */
>  
> +#ifdef USE_AS_STPNCPY
> +	/* Compute pointer to last byte copied into dest.  */
> +	add	r3,r11,r7
> +#endif
>  	add	r11,r11,r8
>  	sub	r5,r5,r8
>  	b L(zero_padding_loop)
> @@ -168,6 +198,10 @@ L(n_tail4):
>  	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
>  	addi	r11,r11,48	/* Offset */
>  	stxvl	32+v3,r11,r10	/* Partial store  */
> +#ifdef USE_AS_STPNCPY
> +	/* Compute pointer to last byte copied into dest.  */
> +	add	r3,r11,r5
> +#endif
>  	blr
>  
>  L(prep_n_tail1):
> @@ -179,6 +213,10 @@ L(prep_n_tail1):
>  L(n_tail1):
>  	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
>  	stxvl	32+v0,r11,r10	/* Partial store  */
> +#ifdef USE_AS_STPNCPY
> +	/* Compute pointer to last byte copied into dest.  */
> +	add	r3,r11,r5
> +#endif
>  	blr
>  
>  L(prep_n_tail2):
> @@ -192,6 +230,10 @@ L(n_tail2):
>  	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
>  	addi	r11,r11,16	/* offset */
>  	stxvl	32+v1,r11,r10	/* Partial store  */
> +#ifdef USE_AS_STPNCPY
> +	/* Compute pointer to last byte copied into dest.  */
> +	add	r3,r11,r5
> +#endif
>  	blr
>  
>  L(prep_n_tail3):
> @@ -206,6 +248,10 @@ L(n_tail3):
>  	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
>  	addi	r11,r11,32	/* Offset */
>  	stxvl	32+v2,r11,r10	/* Partial store  */
> +#ifdef USE_AS_STPNCPY
> +	/* Compute pointer to last byte copied into dest.  */
> +	add	r3,r11,r5
> +#endif
>  	blr
>  
>  L(prep_tail1):
> @@ -215,6 +261,10 @@ L(tail1):
>  	addi	r9,r8,1		/* Add null terminator  */
>  	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
>  	stxvl	32+v0,r11,r10	/* Partial store  */
> +#ifdef USE_AS_STPNCPY
> +	/* Compute pointer to last byte copied into dest.  */
> +	add	r3,r11,r8
> +#endif
>  	add	r11,r11,r9
>  	sub	r5,r5,r9
>  	b L(zero_padding_loop)
> @@ -229,6 +279,10 @@ L(tail2):
>  	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
>  	addi	r11,r11,16	/* offset */
>  	stxvl	32+v1,r11,r10	/* Partial store  */
> +#ifdef USE_AS_STPNCPY
> +	/* Compute pointer to last byte copied into dest.  */
> +	add	r3,r11,r8
> +#endif
>  	add	r11,r11,r9
>  	sub	r5,r5,r9
>  	b L(zero_padding_loop)
> @@ -244,6 +298,10 @@ L(tail3):
>  	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
>  	addi	r11,r11,32	/* offset */
>  	stxvl	32+v2,r11,r10	/* Partial store  */
> +#ifdef USE_AS_STPNCPY
> +	/* Compute pointer to last byte copied into dest.  */
> +	add	r3,r11,r8
> +#endif
>  	add	r11,r11,r9
>  	sub	r5,r5,r9
>  	b L(zero_padding_loop)
> @@ -259,6 +317,10 @@ L(tail4):
>  	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
>  	addi	r11,r11,48	/* offset */
>  	stxvl	32+v3,r11,r10	/* Partial store  */
> +#ifdef USE_AS_STPNCPY
> +	/* Compute pointer to last byte copied into dest.  */
> +	add	r3,r11,r8
> +#endif
>  	add	r11,r11,r9
>  	sub	r5,r5,r9
>  
> @@ -279,3 +341,6 @@ L(zero_padding_end):
>  	blr
>  
>  END (FUNC_NAME)
> +#ifndef USE_AS_STPNCPY
> +libc_hidden_builtin_def (strncpy)
> +#endif
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
> index cd2b47b403..f46bf50732 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
> +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
> @@ -33,7 +33,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
>  
>  ifneq (,$(filter %le,$(config-machine)))
>  sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
> -		   rawmemchr-power9 strlen-power9 strncpy-power9
> +		   rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9
>  endif
>  CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
>  CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> index aa63e1c23f..56790bcfe3 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> @@ -317,6 +317,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>  
>    /* Support sysdeps/powerpc/powerpc64/multiarch/stpncpy.c.  */
>    IFUNC_IMPL (i, name, stpncpy,
> +#ifdef __LITTLE_ENDIAN__
> +	      IFUNC_IMPL_ADD (array, i, stpncpy,
> +			      hwcap2 & PPC_FEATURE2_ARCH_3_00,
> +			      __stpncpy_power9)
> +#endif
>  	      IFUNC_IMPL_ADD (array, i, stpncpy,
>  			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
>  			      __stpncpy_power8)
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S
> new file mode 100644
> index 0000000000..ccbab55c31
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S
> @@ -0,0 +1,24 @@
> +/* Optimized stpncpy implementation for POWER9 LE.
> +   Copyright (C) 2020 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#define STPNCPY __stpncpy_power9
> +
> +#undef libc_hidden_builtin_def
> +#define libc_hidden_builtin_def(name)
> +
> +#include <sysdeps/powerpc/powerpc64/le/power9/stpncpy.S>
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
> index 17df886431..ac17b26650 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
> @@ -26,10 +26,17 @@
>  extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden;
>  extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden;
>  extern __typeof (__stpncpy) __stpncpy_power8 attribute_hidden;
> +# ifdef __LITTLE_ENDIAN__
> +extern __typeof (__stpncpy) __stpncpy_power9 attribute_hidden;
> +# endif
>  # undef stpncpy
>  # undef __stpncpy
>  
>  libc_ifunc_redirected (__redirect___stpncpy, __stpncpy,
> +# ifdef __LITTLE_ENDIAN__
> +		     (hwcap2 & PPC_FEATURE2_ARCH_3_00)
> +		     ? __stpncpy_power9 :
> +# endif
>  		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
>  		       ? __stpncpy_power8
>  		       : (hwcap & PPC_FEATURE_HAS_VSX)
> 

LGTM.

Reviewed-by: Matheus Castanho <msc@linux.ibm.com>

--
Matheus Castanho
Raphael M Zinsly Sept. 16, 2020, 12:56 p.m. UTC | #4
Hi Matheus,

On 16/09/2020 09:32, Matheus Castanho wrote:
> On 9/4/20 1:59 PM, Raphael M Zinsly via Libc-alpha wrote:
>> Benchtest output:
>>                                  generic_stpncpy    __stpncpy_power9  __stpncpy_power8    __stpncpy_power7    __stpncpy_ppc
> <snip>
>> Length  512, n 1024, alignment  0/ 0:    20.5111    22.9782   19.6648    21.3857 42.4801
> <snip>
>> Length  512, n 1024, alignment  1/ 6:    29.9694    24.3087   22.0513    46.7436 51.5908
> 
> These two seem to be the only cases in which the power9 version loses to
> the power8 one. Have you investigated what happens in these two specific
> cases?
> 
Yes, the power8 optimization calls memset to do the zero padding at the
end if n > length. In these cases, where n is much larger than the
length, memset is faster than the loop used in my implementation.


Thanks for the review!

Regards,
Paul A. Clarke Sept. 18, 2020, 3:53 p.m. UTC | #5
On Wed, Sep 16, 2020 at 09:56:59AM -0300, Raphael M Zinsly via Libc-alpha wrote:
> On 16/09/2020 09:32, Matheus Castanho wrote:
> > On 9/4/20 1:59 PM, Raphael M Zinsly via Libc-alpha wrote:
> > > Benchtest output:
> > >                                  generic_stpncpy    __stpncpy_power9  __stpncpy_power8    __stpncpy_power7    __stpncpy_ppc
> > <snip>
> > > Length  512, n 1024, alignment  0/ 0:    20.5111    22.9782   19.6648    21.3857 42.4801
> > <snip>
> > > Length  512, n 1024, alignment  1/ 6:    29.9694    24.3087   22.0513    46.7436 51.5908
> > 
> > These two seem to be the only cases in which the power9 version loses to
> > the power8 one. Have you investigated what happens in these two specific
> > cases?
> > 
> Yes the power8 optimization calls memset to do the zero padding at the end
> if n > length. In this case where n is way higher, memset is faster than the
> loop used in my implementation.

Is there some sort of threshold that would help these cases by transitioning
to memset (or replicating the relevant part of that code here)?

PC
diff mbox series

Patch

diff --git a/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S b/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
new file mode 100644
index 0000000000..81d9673d8b
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
@@ -0,0 +1,24 @@ 
+/* Optimized stpncpy implementation for POWER9 LE.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define USE_AS_STPNCPY
+#include <sysdeps/powerpc/powerpc64/le/power9/strncpy.S>
+
+weak_alias (__stpncpy, stpncpy)
+libc_hidden_def (__stpncpy)
+libc_hidden_builtin_def (stpncpy)
diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
index 34fcdee913..f7265b11ec 100644
--- a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
+++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
@@ -18,16 +18,30 @@ 
 
 #include <sysdep.h>
 
+#ifdef USE_AS_STPNCPY
+# ifndef STPNCPY
+#   define FUNC_NAME __stpncpy
+# else
+#   define FUNC_NAME STPNCPY
+# endif
+#else
 # ifndef STRNCPY
 #  define FUNC_NAME strncpy
 # else
 #  define FUNC_NAME STRNCPY
 # endif
+#endif  /* !USE_AS_STPNCPY  */
 
 /* Implements the function
 
    char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
 
+   or
+
+   char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+
+   if USE_AS_STPNCPY is defined.
+
    The implementation can load bytes past a null terminator, but only
    up to the next 16-byte aligned address, so it never crosses a page.  */
 
@@ -49,7 +63,15 @@  ENTRY_TOCLESS (FUNC_NAME, 4)
 
 	/* Empty/1-byte string optimization  */
 	cmpdi	r5,0
+#ifdef USE_AS_STPNCPY
+	bgt	L(cont)
+	/* Compute pointer to last byte copied into dest.  */
+	addi	r3,r3,1
+	blr
+L(cont):
+#else
 	beqlr
+#endif
 
 	addi	r4,r4,1
 	neg	r7,r4
@@ -79,12 +101,20 @@  ENTRY_TOCLESS (FUNC_NAME, 4)
 	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
 	stxvl	32+v0,r11,r10	/* Partial store  */
 
+#ifdef USE_AS_STPNCPY
+	/* Compute pointer to last byte copied into dest.  */
+	add	r3,r11,r5
+#endif
 	blr
 
 L(null):
 	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
 	stxvl	32+v0,r11,r10	/* Partial store  */
 
+#ifdef USE_AS_STPNCPY
+	/* Compute pointer to last byte copied into dest.  */
+	add	r3,r11,r7
+#endif
 	add	r11,r11,r8
 	sub	r5,r5,r8
 	b L(zero_padding_loop)
@@ -168,6 +198,10 @@  L(n_tail4):
 	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
 	addi	r11,r11,48	/* Offset */
 	stxvl	32+v3,r11,r10	/* Partial store  */
+#ifdef USE_AS_STPNCPY
+	/* Compute pointer to last byte copied into dest.  */
+	add	r3,r11,r5
+#endif
 	blr
 
 L(prep_n_tail1):
@@ -179,6 +213,10 @@  L(prep_n_tail1):
 L(n_tail1):
 	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
 	stxvl	32+v0,r11,r10	/* Partial store  */
+#ifdef USE_AS_STPNCPY
+	/* Compute pointer to last byte copied into dest.  */
+	add	r3,r11,r5
+#endif
 	blr
 
 L(prep_n_tail2):
@@ -192,6 +230,10 @@  L(n_tail2):
 	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
 	addi	r11,r11,16	/* offset */
 	stxvl	32+v1,r11,r10	/* Partial store  */
+#ifdef USE_AS_STPNCPY
+	/* Compute pointer to last byte copied into dest.  */
+	add	r3,r11,r5
+#endif
 	blr
 
 L(prep_n_tail3):
@@ -206,6 +248,10 @@  L(n_tail3):
 	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
 	addi	r11,r11,32	/* Offset */
 	stxvl	32+v2,r11,r10	/* Partial store  */
+#ifdef USE_AS_STPNCPY
+	/* Compute pointer to last byte copied into dest.  */
+	add	r3,r11,r5
+#endif
 	blr
 
 L(prep_tail1):
@@ -215,6 +261,10 @@  L(tail1):
 	addi	r9,r8,1		/* Add null terminator  */
 	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
 	stxvl	32+v0,r11,r10	/* Partial store  */
+#ifdef USE_AS_STPNCPY
+	/* Compute pointer to last byte copied into dest.  */
+	add	r3,r11,r8
+#endif
 	add	r11,r11,r9
 	sub	r5,r5,r9
 	b L(zero_padding_loop)
@@ -229,6 +279,10 @@  L(tail2):
 	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
 	addi	r11,r11,16	/* offset */
 	stxvl	32+v1,r11,r10	/* Partial store  */
+#ifdef USE_AS_STPNCPY
+	/* Compute pointer to last byte copied into dest.  */
+	add	r3,r11,r8
+#endif
 	add	r11,r11,r9
 	sub	r5,r5,r9
 	b L(zero_padding_loop)
@@ -244,6 +298,10 @@  L(tail3):
 	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
 	addi	r11,r11,32	/* offset */
 	stxvl	32+v2,r11,r10	/* Partial store  */
+#ifdef USE_AS_STPNCPY
+	/* Compute pointer to last byte copied into dest.  */
+	add	r3,r11,r8
+#endif
 	add	r11,r11,r9
 	sub	r5,r5,r9
 	b L(zero_padding_loop)
@@ -259,6 +317,10 @@  L(tail4):
 	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
 	addi	r11,r11,48	/* offset */
 	stxvl	32+v3,r11,r10	/* Partial store  */
+#ifdef USE_AS_STPNCPY
+	/* Compute pointer to last byte copied into dest.  */
+	add	r3,r11,r8
+#endif
 	add	r11,r11,r9
 	sub	r5,r5,r9
 
@@ -279,3 +341,6 @@  L(zero_padding_end):
 	blr
 
 END (FUNC_NAME)
+#ifndef USE_AS_STPNCPY
+libc_hidden_builtin_def (strncpy)
+#endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index cd2b47b403..f46bf50732 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -33,7 +33,7 @@  sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
 
 ifneq (,$(filter %le,$(config-machine)))
 sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
-		   rawmemchr-power9 strlen-power9 strncpy-power9
+		   rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9
 endif
 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
 CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index aa63e1c23f..56790bcfe3 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -317,6 +317,11 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/powerpc/powerpc64/multiarch/stpncpy.c.  */
   IFUNC_IMPL (i, name, stpncpy,
+#ifdef __LITTLE_ENDIAN__
+	      IFUNC_IMPL_ADD (array, i, stpncpy,
+			      hwcap2 & PPC_FEATURE2_ARCH_3_00,
+			      __stpncpy_power9)
+#endif
 	      IFUNC_IMPL_ADD (array, i, stpncpy,
 			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
 			      __stpncpy_power8)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S
new file mode 100644
index 0000000000..ccbab55c31
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S
@@ -0,0 +1,24 @@ 
+/* Optimized stpncpy implementation for POWER9 LE.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define STPNCPY __stpncpy_power9
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/le/power9/stpncpy.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
index 17df886431..ac17b26650 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
@@ -26,10 +26,17 @@ 
 extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden;
 extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden;
 extern __typeof (__stpncpy) __stpncpy_power8 attribute_hidden;
+# ifdef __LITTLE_ENDIAN__
+extern __typeof (__stpncpy) __stpncpy_power9 attribute_hidden;
+# endif
 # undef stpncpy
 # undef __stpncpy
 
 libc_ifunc_redirected (__redirect___stpncpy, __stpncpy,
+# ifdef __LITTLE_ENDIAN__
+		     (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+		     ? __stpncpy_power9 :
+# endif
 		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 		       ? __stpncpy_power8
 		       : (hwcap & PPC_FEATURE_HAS_VSX)