diff mbox series

[2/2] powerpc: Optimized stpncpy for POWER9

Message ID 20200820182917.12602-2-rzinsly@linux.ibm.com
State New
Headers show
Series [1/2] powerpc: Optimized strncpy for POWER9 | expand

Commit Message

Raphael M Zinsly Aug. 20, 2020, 6:29 p.m. UTC
Adds stpncpy support into the POWER9 strncpy.
---
 sysdeps/powerpc/powerpc64/le/power9/stpncpy.S | 24 ++++++
 sysdeps/powerpc/powerpc64/le/power9/strncpy.S | 74 +++++++++++++++++++
 sysdeps/powerpc/powerpc64/multiarch/Makefile  |  2 +-
 .../powerpc64/multiarch/ifunc-impl-list.c     |  5 ++
 .../powerpc64/multiarch/stpncpy-power9.S      | 24 ++++++
 sysdeps/powerpc/powerpc64/multiarch/stpncpy.c |  7 ++
 6 files changed, 135 insertions(+), 1 deletion(-)
 create mode 100644 sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S

Comments

Raphael M Zinsly Aug. 20, 2020, 6:31 p.m. UTC | #1
Here is the make bench output:

                             	generic_stpncpy	__stpncpy_power9	__stpncpy_power8	__stpncpy_power7	__stpncpy_ppc
Length   16, n   16, alignment  1/ 1:	7.31792	2.79249	2.98207	6.20964 
11.2262
Length   16, n   16, alignment  1/ 1:	7.26441	2.79883	2.97986	6.09795 
11.1118
Length   16, n   16, alignment  1/ 2:	7.22475	2.82518	2.98169	6.18967 
10.9933
Length   16, n   16, alignment  2/ 1:	7.28211	2.78851	3.1079	6.06067	10.4232
Length    2, n    4, alignment  7/ 2:	9.30193	2.4733	4.30086	4.74387	9.25328
Length    4, n    2, alignment  2/ 7:	6.7756	1.91031	2.93946	3.24475	7.76389
Length    2, n    4, alignment  7/ 2:	8.81319	2.4726	4.57341	4.74421	9.44667
Length    4, n    2, alignment  2/ 7:	6.77806	1.9118	2.93637	3.1857	7.00171
Length   16, n   16, alignment  2/ 2:	7.35335	2.80104	3.10653	5.85492 
10.5689
Length   16, n   16, alignment  2/ 2:	7.14308	2.78571	3.10889	6.10044 
10.4816
Length   16, n   16, alignment  2/ 4:	7.21628	2.81563	3.10724	6.14674 
10.6005
Length   16, n   16, alignment  4/ 2:	7.47713	2.80531	3.80081	5.86977 
9.43599
Length    4, n    8, alignment  6/ 4:	8.63537	2.4676	5.53825	4.1877	9.88309
Length    8, n    4, alignment  4/ 6:	6.63429	1.91051	3.10751	2.76472	8.4156
Length    4, n    8, alignment  6/ 4:	8.59304	2.43152	5.30288	4.16475 
9.77498
Length    8, n    4, alignment  4/ 6:	6.63843	1.91047	3.19713	2.69566 
8.67023
Length   16, n   16, alignment  3/ 3:	7.45277	2.80045	3.42433	6.06204 
9.92282
Length   16, n   16, alignment  3/ 3:	8.04191	2.78645	3.43317	5.99773 
10.0662
Length   16, n   16, alignment  3/ 6:	7.5816	2.81606	3.44168	6.0801	9.94673
Length   16, n   16, alignment  6/ 3:	7.10582	2.80176	5.03947	6.06942 
8.40249
Length    8, n   16, alignment  5/ 6:	8.19747	2.42028	4.30043	5.0752	11.3093
Length   16, n    8, alignment  6/ 5:	6.37287	2.07239	2.56322	4.36972 
6.52164
Length    8, n   16, alignment  5/ 6:	8.25022	2.45124	4.05051	5.02258 
10.8683
Length   16, n    8, alignment  6/ 5:	6.31868	2.07215	2.83061	4.44584 
7.14464
Length   16, n   16, alignment  4/ 4:	7.54408	2.80105	3.82846	5.71392 
9.91359
Length   16, n   16, alignment  4/ 4:	7.66265	2.79063	3.86233	6.06489 
9.31705
Length   16, n   16, alignment  4/ 0:	7.84286	2.79896	3.83148	6.08954 
9.55253
Length   16, n   16, alignment  0/ 4:	7.36697	2.07019	2.66533	6.13894 
7.75685
Length   16, n   32, alignment  4/ 0:	10.3819	3.33088	6.32994	7.24949 
12.3827
Length   32, n   16, alignment  0/ 4:	7.15586	2.07172	2.66097	6.11743 
7.56448
Length   16, n   32, alignment  4/ 0:	10.3262	3.35225	6.34556	7.3211	12.2527
Length   32, n   16, alignment  0/ 4:	7.13287	2.07265	2.6613	6.17878	7.61901
Length   16, n   16, alignment  5/ 5:	7.22471	2.80128	4.65776	6.15455 
9.93333
Length   16, n   16, alignment  5/ 5:	7.22458	2.78586	4.65874	6.06763 
9.87968
Length   16, n   16, alignment  5/ 2:	7.22718	2.79127	4.65999	6.025	10.3775
Length   16, n   16, alignment  2/ 5:	7.73485	2.8025	3.10754	6.08303	10.3871
Length   32, n   64, alignment  3/ 2:	13.7685	4.1256	7.04965	11.5105	15.3903
Length   64, n   32, alignment  2/ 3:	10.526	3.05149	3.59497	8.45078	13.7462
Length   32, n   64, alignment  3/ 2:	13.7681	4.11611	7.08236	11.5129 
16.6004
Length   64, n   32, alignment  2/ 3:	10.962	3.05712	3.60447	8.43981	15.4906
Length   16, n   16, alignment  6/ 6:	7.30916	2.80056	5.03985	6.16331 
8.43692
Length   16, n   16, alignment  6/ 6:	7.31688	2.7914	5.02931	6.12345	8.42848
Length   16, n   16, alignment  6/ 4:	7.7402	2.7993	5.04435	6.02685	8.28199
Length   16, n   16, alignment  4/ 6:	7.79103	2.82496	3.82464	6.0778	9.31532
Length   64, n  128, alignment  2/ 4:	15.4969	5.3714	8.09812	12.6067	18.7831
Length  128, n   64, alignment  4/ 2:	12.9023	3.93138	5.46487	10.7071 
13.3253
Length   64, n  128, alignment  2/ 4:	15.4998	5.42611	7.88843	12.6007 
24.0491
Length  128, n   64, alignment  4/ 2:	12.8971	3.94646	5.49689	11.1747 
21.5779
Length   16, n   16, alignment  7/ 7:	7.68992	2.78151	6.14775	6.19397 
8.38412
Length   16, n   16, alignment  7/ 7:	7.90811	2.7803	6.11502	6.17383	8.78371
Length   16, n   16, alignment  7/ 6:	7.45456	2.80173	5.93657	6.15191 
8.38489
Length   16, n   16, alignment  6/ 7:	7.44846	2.80238	5.03654	6.1154	8.41589
Length  128, n  256, alignment  1/ 6:	17.9114	8.39532	10.3246	17.9457 
21.9452
Length  256, n  128, alignment  6/ 1:	14.8346	5.41104	8.89047	13.5379 
17.1437
Length  128, n  256, alignment  1/ 6:	17.9118	8.39985	10.3271	17.9503 
42.0831
Length  256, n  128, alignment  6/ 1:	14.8306	5.40714	9.04492	13.5227	37.819
Length    8, n   16, alignment  0/ 0:	8.19945	2.46752	4.04264	4.62897 
8.22975
Length   32, n   16, alignment  0/ 0:	7.23617	2.07229	2.66504	2.66683 
7.93411
Length    8, n   16, alignment  7/ 2:	8.26373	2.41779	4.18003	5.31418	9.0473
Length   32, n   16, alignment  7/ 2:	7.46119	2.63992	6.16424	6.14534 
7.28237
Length   16, n   32, alignment  0/ 0:	10.1282	3.42401	5.00287	5.02318 
11.4985
Length   64, n   32, alignment  0/ 0:	9.29452	2.57779	2.79807	3.1362	10.9532
Length   16, n   32, alignment  6/ 4:	10.2194	3.30297	7.48371	10.4067 
11.2264
Length   64, n   32, alignment  6/ 4:	10.6887	3.04976	5.13062	8.10511 
11.1225
Length   32, n   64, alignment  0/ 0:	12.1806	4.09924	5.12341	6.14159 
14.0965
Length  128, n   64, alignment  0/ 0:	10.1569	3.52625	3.88528	4.65782 
11.3018
Length   32, n   64, alignment  5/ 6:	13.7795	4.13456	8.53476	10.2846 
15.1556
Length  128, n   64, alignment  5/ 6:	12.8171	3.92765	5.82505	10.3559 
15.0831
Length   64, n  128, alignment  0/ 0:	13.6328	5.33523	6.43324	7.92213 
16.4658
Length  256, n  128, alignment  0/ 0:	8.92495	4.97169	7.13044	6.30158 
12.9039
Length   64, n  128, alignment  4/ 0:	13.8393	5.36588	7.52682	11.5294 
17.5523
Length  256, n  128, alignment  4/ 0:	13.5309	5.36019	7.56527	13.3503 
17.8202
Length  128, n  256, alignment  0/ 0:	15.2956	8.14449	8.79678	9.69352 
21.2463
Length  512, n  256, alignment  0/ 0:	11.5667	7.22974	10.1355	10.2592 
21.5805
Length  128, n  256, alignment  3/ 2:	18.0152	8.21506	10.9175	20.4131 
22.3927
Length  512, n  256, alignment  3/ 2:	18.7328	7.81909	11.251	25.0633	29.2378
Length  256, n  512, alignment  0/ 0:	17.5135	13.9768	15.6849	16.1219 
30.9344
Length 1024, n  512, alignment  0/ 0:	17.988	11.8498	18.4388	18.7385	41.5762
Length  256, n  512, alignment  2/ 4:	23.3724	14.8026	15.9182	28.6762 
33.9031
Length 1024, n  512, alignment  2/ 4:	27.9562	13.2785	19.5893	46.9671 
52.4943
Length  512, n 1024, alignment  0/ 0:	23.3637	25.283	21.2536	23.4228	55.6501
Length 2048, n 1024, alignment  0/ 0:	31.303	21.2731	40.7001	38.8365	75.1105
Length  512, n 1024, alignment  1/ 6:	33.0535	26.873	24.8167	51.5917	56.236
Length 2048, n 1024, alignment  1/ 6:	47.5444	24.0206	42.5163	86.0245 
92.5819
Paul E Murphy Aug. 28, 2020, 5:04 p.m. UTC | #2
Thank you for your contributions. I have a few minor
comments/suggestions below.

On 8/20/20 1:29 PM, Raphael Moreira Zinsly via Libc-alpha wrote:
> Adds stpncpy support into the POWER9 strncpy.
s/Adds/Add/ s/into the/to/.

Likewise, s/Optimzed/Add optimized/ in the title.

> ---
>   sysdeps/powerpc/powerpc64/le/power9/stpncpy.S | 24 ++++++
>   sysdeps/powerpc/powerpc64/le/power9/strncpy.S | 74 +++++++++++++++++++
>   sysdeps/powerpc/powerpc64/multiarch/Makefile  |  2 +-
>   .../powerpc64/multiarch/ifunc-impl-list.c     |  5 ++
>   .../powerpc64/multiarch/stpncpy-power9.S      | 24 ++++++
>   sysdeps/powerpc/powerpc64/multiarch/stpncpy.c |  7 ++
>   6 files changed, 135 insertions(+), 1 deletion(-)
>   create mode 100644 sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
>   create mode 100644 sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S
> 
> diff --git a/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S b/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
> new file mode 100644
> index 0000000000..a96840bb6f
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
> @@ -0,0 +1,24 @@
> +/* Optimized stpncpy implementation for PowerPC64/POWER9.
> +   Copyright (C) 2015-2020 Free Software Foundation, Inc.
Should this date be exclusively 2020?

> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#define USE_AS_STPNCPY
> +#include <sysdeps/powerpc/powerpc64/le/power9/strncpy.S>
> +
> +weak_alias (__stpncpy, stpncpy)
> +libc_hidden_def (__stpncpy)
> +libc_hidden_builtin_def (stpncpy)

OK.

> diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> index cde68384d4..64b06a9040 100644
> --- a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> +++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> @@ -18,16 +18,30 @@
> 
>   #include <sysdep.h>
> 
> +#ifdef USE_AS_STPNCPY
> +# ifndef STPNCPY
> +#   define FUNC_NAME __stpncpy
> +# else
> +#   define FUNC_NAME STPNCPY
> +# endif
> +#else
>   # ifndef STRNCPY
>   #  define FUNC_NAME strncpy
>   # else
>   #  define FUNC_NAME STRNCPY
>   # endif
> +#endif  /* !USE_AS_STPNCPY  */
> 
>   /* Implements the function
> 
>      char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
> 
> +   or
> +
> +   char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5])
> +
> +   if USE_AS_STPNCPY is defined.
> +
>      The implementation can load bytes past a null terminator, but only
>      up to the next 16B boundary, so it never crosses a page.  */
> 
> @@ -47,6 +61,13 @@ ENTRY_TOCLESS (FUNC_NAME, 4)
>   	beq	L(zero_padding_loop)
> 
>   	cmpwi	r5,0
> +#ifdef USE_AS_STPNCPY
> +	bgt	L(cont)
> +	/* stpncpy returns the dest address plus the size not counting the
> +	   final '\0'.  */
"Compute pointer to last byte copied into dest."  Likewise for the other 
copied instances.

> +	addi	r3,r3,1
> +	blr
> +#endif
OK.

>   	beqlr
This is unreachable in stpncpy; can it be conditionally included only in
the !stpncpy configuration?

> 
>   L(cont):
> @@ -77,12 +98,22 @@ L(cont):
>   	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
>   	stxvl	32+v0,r11,r10	/* Partial store  */
> 
> +#ifdef USE_AS_STPNCPY
> +	/* stpncpy returns the dest address plus the size not counting the
> +	   final '\0'.  */
> +	add	r3,r11,r5
> +#endif
>   	blr
> 
>   L(null):
>   	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
>   	stxvl	32+v0,r11,r10	/* Partial store  */
> 
> +#ifdef USE_AS_STPNCPY
> +	/* stpncpy returns the dest address plus the size not counting the
> +	   final '\0'.  */
> +	add	r3,r11,r7
> +#endif
>   	add	r11,r11,r8
>   	sub	r5,r5,r8
>   	b L(zero_padding_loop)
> @@ -164,6 +195,11 @@ L(n_tail4):
>   	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
>   	addi	r11,r11,48	/* Offset */
>   	stxvl	32+v3,r11,r10	/* Partial store  */
> +#ifdef USE_AS_STPNCPY
> +	/* stpncpy returns the dest address plus the size not counting the
> +	   final '\0'.  */
> +	add	r3,r11,r5
> +#endif
>   	blr
> 
>   L(prep_n_tail1):
> @@ -174,6 +210,11 @@ L(prep_n_tail1):
>   L(n_tail1):
>   	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
>   	stxvl	32+v0,r11,r10	/* Partial store  */
> +#ifdef USE_AS_STPNCPY
> +	/* stpncpy returns the dest address plus the size not counting the
> +	   final '\0'.  */
> +	add	r3,r11,r5
> +#endif
>   	blr
> 
>   L(prep_n_tail2):
> @@ -186,6 +227,11 @@ L(n_tail2):
>   	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
>   	addi	r11,r11,16	/* offset */
>   	stxvl	32+v1,r11,r10	/* Partial store  */
> +#ifdef USE_AS_STPNCPY
> +	/* stpncpy returns the dest address plus the size not counting the
> +	   final '\0'.  */
> +	add	r3,r11,r5
> +#endif
>   	blr
> 
>   L(prep_n_tail3):
> @@ -199,6 +245,11 @@ L(n_tail3):
>   	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
>   	addi	r11,r11,32	/* Offset */
>   	stxvl	32+v2,r11,r10	/* Partial store  */
> +#ifdef USE_AS_STPNCPY
> +	/* stpncpy returns the dest address plus the size not counting the
> +	   final '\0'.  */
> +	add	r3,r11,r5
> +#endif
>   	blr
> 
>   L(prep_tail1):
> @@ -208,6 +259,11 @@ L(tail1):
>   	addi	r9,r8,1	/* Add null terminator  */
>   	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
>   	stxvl	32+v0,r11,r10	/* Partial store  */
> +#ifdef USE_AS_STPNCPY
> +	/* stpncpy returns the dest address plus the size not counting the
> +	   final '\0'.  */
> +	add	r3,r11,r8
> +#endif
>   	add	r11,r11,r9
>   	sub	r5,r5,r9
>   	b L(zero_padding_loop)
> @@ -222,6 +278,11 @@ L(tail2):
>   	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
>   	addi	r11,r11,16	/* offset */
>   	stxvl	32+v1,r11,r10	/* Partial store  */
> +#ifdef USE_AS_STPNCPY
> +	/* stpncpy returns the dest address plus the size not counting the
> +	   final '\0'.  */
> +	add	r3,r11,r8
> +#endif
>   	add	r11,r11,r9
>   	sub	r5,r5,r9
>   	b L(zero_padding_loop)
> @@ -237,6 +298,11 @@ L(tail3):
>   	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
>   	addi	r11,r11,32	/* offset */
>   	stxvl	32+v2,r11,r10	/* Partial store  */
> +#ifdef USE_AS_STPNCPY
> +	/* stpncpy returns the dest address plus the size not counting the
> +	   final '\0'.  */
> +	add	r3,r11,r8
> +#endif
>   	add	r11,r11,r9
>   	sub	r5,r5,r9
>   	b L(zero_padding_loop)
> @@ -252,6 +318,11 @@ L(tail4):
>   	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
>   	addi	r11,r11,48	/* offset */
>   	stxvl	32+v3,r11,r10	/* Partial store  */
> +#ifdef USE_AS_STPNCPY
> +	/* stpncpy returns the dest address plus the size not counting the
> +	   final '\0'.  */
> +	add	r3,r11,r8
> +#endif
>   	add	r11,r11,r9
>   	sub	r5,r5,r9
> 
> @@ -274,3 +345,6 @@ L(zero_padding_end):
>   L(n_tail):
> 
>   END (FUNC_NAME)
> +#ifndef USE_AS_STPNCPY
> +libc_hidden_builtin_def (strncpy)
> +#endif
OK.

> diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
> index cd2b47b403..f46bf50732 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
> +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile

OK.

> diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> index aa63e1c23f..56790bcfe3 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c

OK.

> diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S
> new file mode 100644
> index 0000000000..ecbbb5c8e9
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S
> @@ -0,0 +1,24 @@
> +/* Optimized stpncpy implementation for POWER9/PPC64.
> +   Copyright (C) 2015-2020 Free Software Foundation, Inc.
Minor nit, I suspect that date should only include 2020.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#define STPNCPY __stpncpy_power9
> +
> +#undef libc_hidden_builtin_def
> +#define libc_hidden_builtin_def(name)
> +
OK.

> +#include <sysdeps/powerpc/powerpc64/le/power9/stpncpy.S>
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
> index 17df886431..21702716a3 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
> @@ -26,10 +26,17 @@
>   extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden;
>   extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden;
>   extern __typeof (__stpncpy) __stpncpy_power8 attribute_hidden;
> +# ifdef __LITTLE_ENDIAN__
> +extern __typeof (__stpncpy) __stpncpy_power9 attribute_hidden;
> +# endif
>   # undef stpncpy
>   # undef __stpncpy
> 
>   libc_ifunc_redirected (__redirect___stpncpy, __stpncpy,
> +# ifdef __LITTLE_ENDIAN__
> +		   (hwcap2 & PPC_FEATURE2_ARCH_3_00)
> +		   ? __stpncpy_power9 :
> +# endif
>   		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
>   		       ? __stpncpy_power8
>   		       : (hwcap & PPC_FEATURE_HAS_VSX)
> 
I think the spacing is off by two here.
diff mbox series

Patch

diff --git a/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S b/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
new file mode 100644
index 0000000000..a96840bb6f
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
@@ -0,0 +1,24 @@ 
+/* Optimized stpncpy implementation for PowerPC64/POWER9.
+   Copyright (C) 2015-2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define USE_AS_STPNCPY
+#include <sysdeps/powerpc/powerpc64/le/power9/strncpy.S>
+
+weak_alias (__stpncpy, stpncpy)
+libc_hidden_def (__stpncpy)
+libc_hidden_builtin_def (stpncpy)
diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
index cde68384d4..64b06a9040 100644
--- a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
+++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
@@ -18,16 +18,30 @@ 
 
 #include <sysdep.h>
 
+#ifdef USE_AS_STPNCPY
+# ifndef STPNCPY
+#   define FUNC_NAME __stpncpy
+# else
+#   define FUNC_NAME STPNCPY
+# endif
+#else
 # ifndef STRNCPY
 #  define FUNC_NAME strncpy
 # else
 #  define FUNC_NAME STRNCPY
 # endif
+#endif  /* !USE_AS_STPNCPY  */
 
 /* Implements the function
 
    char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
 
+   or
+
+   char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+
+   if USE_AS_STPNCPY is defined.
+
    The implementation can load bytes past a null terminator, but only
    up to the next 16B boundary, so it never crosses a page.  */
 
@@ -47,6 +61,13 @@  ENTRY_TOCLESS (FUNC_NAME, 4)
 	beq	L(zero_padding_loop)
 
 	cmpwi	r5,0
+#ifdef USE_AS_STPNCPY
+	bgt	L(cont)
+	/* stpncpy returns the dest address plus the size not counting the
+	   final '\0'.  */
+	addi	r3,r3,1
+	blr
+#endif
 	beqlr
 
 L(cont):
@@ -77,12 +98,22 @@  L(cont):
 	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
 	stxvl	32+v0,r11,r10	/* Partial store  */
 
+#ifdef USE_AS_STPNCPY
+	/* stpncpy returns the dest address plus the size not counting the
+	   final '\0'.  */
+	add	r3,r11,r5
+#endif
 	blr
 
 L(null):
 	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
 	stxvl	32+v0,r11,r10	/* Partial store  */
 
+#ifdef USE_AS_STPNCPY
+	/* stpncpy returns the dest address plus the size not counting the
+	   final '\0'.  */
+	add	r3,r11,r7
+#endif
 	add	r11,r11,r8
 	sub	r5,r5,r8
 	b L(zero_padding_loop)
@@ -164,6 +195,11 @@  L(n_tail4):
 	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
 	addi	r11,r11,48	/* Offset */
 	stxvl	32+v3,r11,r10	/* Partial store  */
+#ifdef USE_AS_STPNCPY
+	/* stpncpy returns the dest address plus the size not counting the
+	   final '\0'.  */
+	add	r3,r11,r5
+#endif
 	blr
 
 L(prep_n_tail1):
@@ -174,6 +210,11 @@  L(prep_n_tail1):
 L(n_tail1):
 	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
 	stxvl	32+v0,r11,r10	/* Partial store  */
+#ifdef USE_AS_STPNCPY
+	/* stpncpy returns the dest address plus the size not counting the
+	   final '\0'.  */
+	add	r3,r11,r5
+#endif
 	blr
 
 L(prep_n_tail2):
@@ -186,6 +227,11 @@  L(n_tail2):
 	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
 	addi	r11,r11,16	/* offset */
 	stxvl	32+v1,r11,r10	/* Partial store  */
+#ifdef USE_AS_STPNCPY
+	/* stpncpy returns the dest address plus the size not counting the
+	   final '\0'.  */
+	add	r3,r11,r5
+#endif
 	blr
 
 L(prep_n_tail3):
@@ -199,6 +245,11 @@  L(n_tail3):
 	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
 	addi	r11,r11,32	/* Offset */
 	stxvl	32+v2,r11,r10	/* Partial store  */
+#ifdef USE_AS_STPNCPY
+	/* stpncpy returns the dest address plus the size not counting the
+	   final '\0'.  */
+	add	r3,r11,r5
+#endif
 	blr
 
 L(prep_tail1):
@@ -208,6 +259,11 @@  L(tail1):
 	addi	r9,r8,1	/* Add null terminator  */
 	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
 	stxvl	32+v0,r11,r10	/* Partial store  */
+#ifdef USE_AS_STPNCPY
+	/* stpncpy returns the dest address plus the size not counting the
+	   final '\0'.  */
+	add	r3,r11,r8
+#endif
 	add	r11,r11,r9
 	sub	r5,r5,r9
 	b L(zero_padding_loop)
@@ -222,6 +278,11 @@  L(tail2):
 	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
 	addi	r11,r11,16	/* offset */
 	stxvl	32+v1,r11,r10	/* Partial store  */
+#ifdef USE_AS_STPNCPY
+	/* stpncpy returns the dest address plus the size not counting the
+	   final '\0'.  */
+	add	r3,r11,r8
+#endif
 	add	r11,r11,r9
 	sub	r5,r5,r9
 	b L(zero_padding_loop)
@@ -237,6 +298,11 @@  L(tail3):
 	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
 	addi	r11,r11,32	/* offset */
 	stxvl	32+v2,r11,r10	/* Partial store  */
+#ifdef USE_AS_STPNCPY
+	/* stpncpy returns the dest address plus the size not counting the
+	   final '\0'.  */
+	add	r3,r11,r8
+#endif
 	add	r11,r11,r9
 	sub	r5,r5,r9
 	b L(zero_padding_loop)
@@ -252,6 +318,11 @@  L(tail4):
 	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
 	addi	r11,r11,48	/* offset */
 	stxvl	32+v3,r11,r10	/* Partial store  */
+#ifdef USE_AS_STPNCPY
+	/* stpncpy returns the dest address plus the size not counting the
+	   final '\0'.  */
+	add	r3,r11,r8
+#endif
 	add	r11,r11,r9
 	sub	r5,r5,r9
 
@@ -274,3 +345,6 @@  L(zero_padding_end):
 L(n_tail):
 
 END (FUNC_NAME)
+#ifndef USE_AS_STPNCPY
+libc_hidden_builtin_def (strncpy)
+#endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index cd2b47b403..f46bf50732 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -33,7 +33,7 @@  sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
 
 ifneq (,$(filter %le,$(config-machine)))
 sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
-		   rawmemchr-power9 strlen-power9 strncpy-power9
+		   rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9
 endif
 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
 CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index aa63e1c23f..56790bcfe3 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -317,6 +317,11 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/powerpc/powerpc64/multiarch/stpncpy.c.  */
   IFUNC_IMPL (i, name, stpncpy,
+#ifdef __LITTLE_ENDIAN__
+	      IFUNC_IMPL_ADD (array, i, stpncpy,
+			      hwcap2 & PPC_FEATURE2_ARCH_3_00,
+			      __stpncpy_power9)
+#endif
 	      IFUNC_IMPL_ADD (array, i, stpncpy,
 			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
 			      __stpncpy_power8)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S
new file mode 100644
index 0000000000..ecbbb5c8e9
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S
@@ -0,0 +1,24 @@ 
+/* Optimized stpncpy implementation for POWER9/PPC64.
+   Copyright (C) 2015-2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define STPNCPY __stpncpy_power9
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/le/power9/stpncpy.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
index 17df886431..21702716a3 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
@@ -26,10 +26,17 @@ 
 extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden;
 extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden;
 extern __typeof (__stpncpy) __stpncpy_power8 attribute_hidden;
+# ifdef __LITTLE_ENDIAN__
+extern __typeof (__stpncpy) __stpncpy_power9 attribute_hidden;
+# endif
 # undef stpncpy
 # undef __stpncpy
 
 libc_ifunc_redirected (__redirect___stpncpy, __stpncpy,
+# ifdef __LITTLE_ENDIAN__
+		   (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+		   ? __stpncpy_power9 :
+# endif
 		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 		       ? __stpncpy_power8
 		       : (hwcap & PPC_FEATURE_HAS_VSX)