diff mbox series

[1/2] powerpc: Optimized strncpy for POWER9

Message ID 20200820182917.12602-1-rzinsly@linux.ibm.com
State New
Headers show
Series [1/2] powerpc: Optimized strncpy for POWER9 | expand

Commit Message

Raphael M Zinsly Aug. 20, 2020, 6:29 p.m. UTC
Similar to the strcpy P9 optimization, this version uses VSX to improve
performance.
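
For reference, the contract being optimized is that of ISO C strncpy:
copy at most n bytes from src, then NUL-pad the remainder of dest.  A
portable C sketch of the same semantics (not the optimized code):

	#include <stddef.h>

	char *
	strncpy_ref (char *dest, const char *src, size_t n)
	{
	  size_t i = 0;
	  /* Copy bytes from src until a null terminator is seen or n
	     bytes have been copied.  */
	  for (; i < n && src[i] != '\0'; i++)
	    dest[i] = src[i];
	  /* Zero-pad the rest of dest; the terminator, if any, is
	     written here (cf. L(zero_padding_loop) in the assembly).  */
	  for (; i < n; i++)
	    dest[i] = '\0';
	  return dest;
	}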
---
 sysdeps/powerpc/powerpc64/le/power9/strncpy.S | 276 ++++++++++++++++++
 sysdeps/powerpc/powerpc64/multiarch/Makefile  |   2 +-
 .../powerpc64/multiarch/ifunc-impl-list.c     |   5 +
 .../powerpc64/multiarch/strncpy-power9.S      |  26 ++
 sysdeps/powerpc/powerpc64/multiarch/strncpy.c |   7 +
 5 files changed, 315 insertions(+), 1 deletion(-)
 create mode 100644 sysdeps/powerpc/powerpc64/le/power9/strncpy.S
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S

Comments

Raphael M Zinsly Aug. 20, 2020, 6:31 p.m. UTC | #1
Here is the make bench output:

                             	generic_strncpy	__strncpy_power9	__strncpy_power8	__strncpy_power7	__strncpy_ppc
Length   16, n   16, alignment  1/ 1:	7.11694	2.77348	2.80296	6.5724	10.4471
Length   16, n   16, alignment  1/ 1:	7.1557	2.75968	2.805	6.5748	10.5064
Length   16, n   16, alignment  1/ 2:	7.17956	2.79127	2.79964	6.57323	10.3281
Length   16, n   16, alignment  2/ 1:	7.15841	2.77364	3.10582	6.2332	10.331
Length    2, n    4, alignment  7/ 2:	8.90911	2.4623	5.38449	5.64873	9.36348
Length    4, n    2, alignment  2/ 7:	6.65395	1.84558	2.58298	3.10566	7.46376
Length    2, n    4, alignment  7/ 2:	8.70625	2.41166	5.38131	5.73421	9.64285
Length    4, n    2, alignment  2/ 7:	6.65458	1.84354	2.58382	3.64721	6.96163
Length   16, n   16, alignment  2/ 2:	7.01778	2.77373	3.10668	6.58047	10.6006
Length   16, n   16, alignment  2/ 2:	7.53778	2.75789	3.10591	6.2277	10.2613
Length   16, n   16, alignment  2/ 4:	7.13828	2.79132	3.10567	6.56847	10.619
Length   16, n   16, alignment  4/ 2:	7.38659	2.77668	3.70851	6.54537	9.17368
Length    4, n    8, alignment  6/ 4:	8.71748	2.45183	5.76669	4.65782	10.014
Length    8, n    4, alignment  4/ 6:	6.5504	1.83463	2.96574	2.66227	8.49964
Length    4, n    8, alignment  6/ 4:	8.96461	2.4499	5.78384	5.32287	9.79641
Length    8, n    4, alignment  4/ 6:	6.48083	1.83265	2.9783	3.38632	8.51888
Length   16, n   16, alignment  3/ 3:	7.7538	2.77353	3.29008	6.55912	9.94143
Length   16, n   16, alignment  3/ 3:	7.75279	2.76148	3.30616	6.5445	9.98866
Length   16, n   16, alignment  3/ 6:	7.21486	2.79444	3.33712	6.24747	10.113
Length   16, n   16, alignment  6/ 3:	6.99138	2.77778	4.50777	6.22522	8.53482
Length    8, n   16, alignment  5/ 6:	8.26994	2.77966	4.60681	6.10938	10.5975
Length   16, n    8, alignment  6/ 5:	6.28062	2.07193	2.57761	4.95636	6.48035
Length    8, n   16, alignment  5/ 6:	8.17113	2.43559	4.27753	5.95453	11.1796
Length   16, n    8, alignment  6/ 5:	6.21214	2.07239	2.57714	4.96762	6.76041
Length   16, n   16, alignment  4/ 4:	7.31373	2.77573	3.78349	6.19349	8.91432
Length   16, n   16, alignment  4/ 4:	7.32226	2.75658	3.70319	6.60792	9.17307
Length   16, n   16, alignment  4/ 0:	7.58812	2.76841	3.71554	6.54282	8.90051
Length   16, n   16, alignment  0/ 4:	6.92871	2.06944	2.66876	6.63947	8.08171
Length   16, n   32, alignment  4/ 0:	10.2972	3.3192	6.53695	7.77295	12.332
Length   32, n   16, alignment  0/ 4:	6.98056	2.06954	2.66445	6.54976	7.65286
Length   16, n   32, alignment  4/ 0:	10.5356	3.31343	6.53813	7.72029	12.2915
Length   32, n   16, alignment  0/ 4:	7.36068	2.06945	2.66424	6.21052	8.07614
Length   16, n   16, alignment  5/ 5:	7.2122	2.77732	4.17451	6.55383	10.4887
Length   16, n   16, alignment  5/ 5:	7.34438	2.77512	4.17191	6.56873	10.5664
Length   16, n   16, alignment  5/ 2:	7.15746	2.76198	4.14481	6.56235	10.7391
Length   16, n   16, alignment  2/ 5:	7.19372	2.79273	3.10693	6.56984	10.2697
Length   32, n   64, alignment  3/ 2:	15.3918	4.22964	7.0146	12.5809	13.8661
Length   64, n   32, alignment  2/ 3:	10.5331	3.02942	3.54253	9.19106	12.9356
Length   32, n   64, alignment  3/ 2:	15.369	4.17282	7.36163	12.5759	16.8501
Length   64, n   32, alignment  2/ 3:	10.5585	3.01971	3.52885	9.03369	15.6663
Length   16, n   16, alignment  6/ 6:	7.0405	2.77527	4.53842	6.54733	7.99437
Length   16, n   16, alignment  6/ 6:	7.02801	2.76059	4.52873	6.53536	8.45713
Length   16, n   16, alignment  6/ 4:	7.42011	2.77669	4.52223	6.57756	7.9899
Length   16, n   16, alignment  4/ 6:	7.37787	2.77507	3.77821	6.57058	9.17396
Length   64, n  128, alignment  2/ 4:	17.188	5.33493	8.00394	12.6196	19.1784
Length  128, n   64, alignment  4/ 2:	12.7962	3.91004	5.42994	11.294	12.5273
Length   64, n  128, alignment  2/ 4:	17.2298	5.2748	8.15392	12.6039	24.3802
Length  128, n   64, alignment  4/ 2:	12.7866	3.87534	5.3334	11.8516	21.6528
Length   16, n   16, alignment  7/ 7:	7.75015	2.76775	5.59024	6.57976	8.42318
Length   16, n   16, alignment  7/ 7:	7.81681	2.75691	5.56801	6.55397	10.0378
Length   16, n   16, alignment  7/ 6:	7.75225	2.77446	5.56813	6.57349	8.49645
Length   16, n   16, alignment  6/ 7:	7.23237	2.79186	4.51528	6.55304	8.63443
Length  128, n  256, alignment  1/ 6:	19.8414	8.37691	10.3445	18.4838	22.8314
Length  256, n  128, alignment  6/ 1:	14.7972	5.38498	8.83611	13.8521	16.6154
Length  128, n  256, alignment  1/ 6:	19.8497	8.37754	10.3469	18.2655	43.3568
Length  256, n  128, alignment  6/ 1:	14.7542	5.31075	8.75314	13.7759	37.6351
Length    8, n   16, alignment  0/ 0:	8.19872	2.45818	4.27602	4.6578	7.98513
Length   32, n   16, alignment  0/ 0:	6.92066	2.07115	2.66465	2.66381	7.75655
Length    8, n   16, alignment  7/ 2:	8.18253	2.42685	4.70317	6.01808	9.35743
Length   32, n   16, alignment  7/ 2:	7.79714	2.60074	5.58717	6.64181	6.98583
Length   16, n   32, alignment  0/ 0:	10.4715	3.40184	6.28388	4.86146	11.0819
Length   64, n   32, alignment  0/ 0:	10.4403	2.54135	3.07109	3.38791	9.35196
Length   16, n   32, alignment  6/ 4:	10.7077	3.34867	7.01321	10.4278	11.2951
Length   64, n   32, alignment  6/ 4:	10.9215	3.03041	5.04324	8.30023	11.2648
Length   32, n   64, alignment  0/ 0:	12.0062	4.09428	5.32372	5.48319	14.1455
Length  128, n   64, alignment  0/ 0:	10.1803	3.47282	3.83134	4.21557	10.6674
Length   32, n   64, alignment  5/ 6:	15.4165	4.16297	7.78876	10.8762	15.4308
Length  128, n   64, alignment  5/ 6:	12.7332	3.91667	5.8014	10.5869	14.0961
Length   64, n  128, alignment  0/ 0:	13.238	5.24242	6.90661	8.05566	15.9848
Length  256, n  128, alignment  0/ 0:	8.759	4.9483	6.98675	6.11489	12.6755
Length   64, n  128, alignment  4/ 0:	13.6593	5.27931	8.60925	12.5916	17.5016
Length  256, n  128, alignment  4/ 0:	13.4801	5.37114	7.47485	14.0585	17.4517
Length  128, n  256, alignment  0/ 0:	15.3147	8.02462	8.92006	9.67769	20.3757
Length  512, n  256, alignment  0/ 0:	11.5638	7.22535	9.80468	9.93597	21.3421
Length  128, n  256, alignment  3/ 2:	19.8948	8.15967	10.9435	20.6146	22.4146
Length  512, n  256, alignment  3/ 2:	18.681	7.77864	10.9269	25.9269	28.0105
Length  256, n  512, alignment  0/ 0:	19.4894	13.7363	14.8394	15.4064	31.6341
Length 1024, n  512, alignment  0/ 0:	18.0108	11.8737	18.1779	18.5072	41.5425
Length  256, n  512, alignment  2/ 4:	25.5662	14.5189	16.1872	29.5395	33.7587
Length 1024, n  512, alignment  2/ 4:	28.0079	13.2347	19.067	48.1998	52.3078
Length  512, n 1024, alignment  0/ 0:	23.1385	25.4237	21.2303	23.632	47.4502
Length 2048, n 1024, alignment  0/ 0:	31.201	21.308	40.6351	39.04	75.0329
Length  512, n 1024, alignment  1/ 6:	35.6234	27.0042	24.4711	51.3364	60.6277
Length 2048, n 1024, alignment  1/ 6:	47.442	24.0381	41.6616	85.4832	91.8897
Paul E Murphy Aug. 28, 2020, 2:25 p.m. UTC | #2
On 8/20/20 1:29 PM, Raphael Moreira Zinsly via Libc-alpha wrote:
> Similar to the strcpy P9 optimization, this version uses VSX to improve
> performance.
> ---
>   sysdeps/powerpc/powerpc64/le/power9/strncpy.S | 276 ++++++++++++++++++
>   sysdeps/powerpc/powerpc64/multiarch/Makefile  |   2 +-
>   .../powerpc64/multiarch/ifunc-impl-list.c     |   5 +
>   .../powerpc64/multiarch/strncpy-power9.S      |  26 ++
>   sysdeps/powerpc/powerpc64/multiarch/strncpy.c |   7 +
>   5 files changed, 315 insertions(+), 1 deletion(-)
>   create mode 100644 sysdeps/powerpc/powerpc64/le/power9/strncpy.S
>   create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S
> 
> diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> new file mode 100644
> index 0000000000..cde68384d4
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> @@ -0,0 +1,276 @@
> +/* Optimized strncpy implementation for PowerPC64/POWER9.
> +   Copyright (C) 2020 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +
> +# ifndef STRNCPY
> +#  define FUNC_NAME strncpy
> +# else
> +#  define FUNC_NAME STRNCPY
> +# endif
> +
> +/* Implements the function
> +
> +   char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
> +
> +   The implementation can load bytes past a null terminator, but only
> +   up to the next 16B boundary, so it never crosses a page.  */
> +
> +.machine power9
> +ENTRY_TOCLESS (FUNC_NAME, 4)
> +	CALL_MCOUNT 2
> +
> +	cmpwi   r5, 0
> +	beqlr
Trivial nit, a newline after branches helps readability for me.

> +	/* NULL string optimisation  */
> +	lbz	r0,0(r4)
> +	stb	r0,0(r3)
> +	addi	r11,r3,1
> +	addi	r5,r5,-1
> +	vspltisb v18,0		/* Zeroes in v18  */
> +	cmpwi	r0,0
> +	beq	L(zero_padding_loop)
> +
> +	cmpwi	r5,0
> +	beqlr
OK.

> +
> +L(cont):
I think this label can be removed or replaced with a comment.

> +	addi	r4,r4,1
> +	neg	r7,r4
> +	rldicl	r9,r7,0,60	/* How many bytes to get source 16B aligned?  */
> +
> +	/* Get source 16B aligned  */
> +	lvx	v0,0,r4
> +	lvsr	v1,0,r4
> +	vperm	v0,v18,v0,v1
> +
> +	vcmpequb v6,v0,v18	/* 0xff if byte is NULL, 0x00 otherwise  */
> +	vctzlsbb r7,v6		/* Number of trailing zeroes  */
> +	addi	r8,r7,1	/* Add null terminator  */
Minor nit, can you align the comment with previous comments?

> +
> +	/* r8 = bytes including null
> +	   r9 = bytes to get source 16B aligned
> +	   if r8 > r9
> +	      no null, copy r9 bytes
> +	   else
> +	      there is a null, copy r8 bytes and return.  */
> +	cmpd	r8,r9
> +	bgt	L(no_null)
> +
> +	cmpd	r8,r5		/* r8 <= n?  */
Minor, you could use another CR and run this in parallel with the 
previous check.

> +	ble	L(null)
> +	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
> +	stxvl	32+v0,r11,r10	/* Partial store  */
> +
> +	blr
OK.

> +
> +L(null):
> +	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
> +	stxvl	32+v0,r11,r10	/* Partial store  */
> +
> +	add	r11,r11,r8
> +	sub	r5,r5,r8
> +	b L(zero_padding_loop)
OK.

> +
> +L(no_null):
> +	cmpd	r9,r5		/* Check if length was reached.  */
> +	bge	L(n_tail1)
An extra newline would help here.

> +	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
> +	stxvl	32+v0,r11,r10	/* Partial store  */
> +
> +	add	r4,r4,r9
> +	add	r11,r11,r9
> +	sub	r5,r5,r9
OK.

> +
> +L(loop):
> +	cmpldi	cr6,r5,64	/* Check if length was reached.  */
> +	ble	cr6,L(final_loop)
> +
> +	lxv	32+v0,0(r4)
> +	vcmpequb. v6,v0,v18	/* Any zero bytes?  */
> +	bne	cr6,L(prep_tail1)
> +
> +	lxv	32+v1,16(r4)
> +	vcmpequb. v6,v1,v18	/* Any zero bytes?  */
> +	bne	cr6,L(prep_tail2)
> +
> +	lxv	32+v2,32(r4)
> +	vcmpequb. v6,v2,v18	/* Any zero bytes?  */
> +	bne	cr6,L(prep_tail3)
> +
> +	lxv	32+v3,48(r4)
> +	vcmpequb. v6,v3,v18	/* Any zero bytes?  */
> +	bne	cr6,L(prep_tail4)
> +
> +	stxv	32+v0,0(r11)
> +	stxv	32+v1,16(r11)
> +	stxv	32+v2,32(r11)
> +	stxv	32+v3,48(r11)
> +
> +	addi	r4,r4,64
> +	addi	r11,r11,64
> +	addi	r5,r5,-64
> +
> +	b	L(loop)
OK.

> +
> +L(final_loop):
> +	cmpldi	cr5,r5,16
> +	lxv	32+v0,0(r4)
> +	vcmpequb. v6,v0,v18	/* Any zero bytes?  */
> +	ble	cr5,L(prep_n_tail1)
> +	bne	cr6,L(count_tail1)
> +	addi	r5,r5,-16
> +
> +	cmpldi	cr5,r5,16
> +	lxv	32+v1,16(r4)
> +	vcmpequb. v6,v1,v18	/* Any zero bytes?  */
> +	ble	cr5,L(prep_n_tail2)
> +	bne	cr6,L(count_tail2)
> +	addi	r5,r5,-16
> +
> +	cmpldi	cr5,r5,16
> +	lxv	32+v2,32(r4)
> +	vcmpequb. v6,v2,v18	/* Any zero bytes?  */
> +	ble	cr5,L(prep_n_tail3)
> +	bne	cr6,L(count_tail3)
> +	addi	r5,r5,-16
OK.

> +
> +	lxv	32+v3,48(r4)
> +	vcmpequb. v6,v3,v18	/* Any zero bytes?  */
> +	beq	cr6,L(n_tail4)
> +
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +	cmpd	r8,r5		/* r8 < n?  */
> +	blt	L(tail4)
OK. Newline here (and for the other similar cases below too please).

> +L(n_tail4):
> +	stxv	32+v0,0(r11)
> +	stxv	32+v1,16(r11)
> +	stxv	32+v2,32(r11)
> +	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
> +	addi	r11,r11,48	/* Offset */
> +	stxvl	32+v3,r11,r10	/* Partial store  */
> +	blr
OK.

> +
> +L(prep_n_tail1):
> +	beq	cr6,L(n_tail1)	/* Any zero bytes?  */
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +	cmpd	r8,r5		/* r8 < n?  */
> +	blt	L(tail1)
> +L(n_tail1):
> +	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
> +	stxvl	32+v0,r11,r10	/* Partial store  */
> +	blr
OK.

> +
> +L(prep_n_tail2):
> +	beq	cr6,L(n_tail2)	/* Any zero bytes?  */
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +	cmpd	r8,r5		/* r8 < n?  */
> +	blt	L(tail2)
> +L(n_tail2):
> +	stxv	32+v0,0(r11)
> +	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
> +	addi	r11,r11,16	/* offset */
> +	stxvl	32+v1,r11,r10	/* Partial store  */
> +	blr
OK.

> +
> +L(prep_n_tail3):
> +	beq	cr6,L(n_tail3)	/* Any zero bytes?  */
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +	cmpd	r8,r5		/* r8 < n?  */
> +	blt	L(tail3)
> +L(n_tail3):
> +	stxv	32+v0,0(r11)
> +	stxv	32+v1,16(r11)
> +	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
> +	addi	r11,r11,32	/* Offset */
> +	stxvl	32+v2,r11,r10	/* Partial store  */
> +	blr
OK.

> +
> +L(prep_tail1):
> +L(count_tail1):
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +L(tail1):
> +	addi	r9,r8,1	/* Add null terminator  */
Please align this comment (and the 3 other similar cases).
> +	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
> +	stxvl	32+v0,r11,r10	/* Partial store  */
> +	add	r11,r11,r9
> +	sub	r5,r5,r9
> +	b L(zero_padding_loop)
OK.

> +
> +L(prep_tail2):
> +	addi	r5,r5,-16
> +L(count_tail2):
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +L(tail2):
> +	addi	r9,r8,1	/* Add null terminator  */
> +	stxv	32+v0,0(r11)
> +	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
> +	addi	r11,r11,16	/* offset */
> +	stxvl	32+v1,r11,r10	/* Partial store  */
> +	add	r11,r11,r9
> +	sub	r5,r5,r9
> +	b L(zero_padding_loop)
> +
> +L(prep_tail3):
> +	addi	r5,r5,-32
> +L(count_tail3):
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +L(tail3):
> +	addi	r9,r8,1	/* Add null terminator  */
> +	stxv	32+v0,0(r11)
> +	stxv	32+v1,16(r11)
> +	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
> +	addi	r11,r11,32	/* offset */
> +	stxvl	32+v2,r11,r10	/* Partial store  */
> +	add	r11,r11,r9
> +	sub	r5,r5,r9
> +	b L(zero_padding_loop)
> +
> +L(prep_tail4):
> +	addi	r5,r5,-48
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +L(tail4):
> +	addi	r9,r8,1	/* Add null terminator  */
> +	stxv	32+v0,0(r11)
> +	stxv	32+v1,16(r11)
> +	stxv	32+v2,32(r11)
> +	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
> +	addi	r11,r11,48	/* offset */
> +	stxvl	32+v3,r11,r10	/* Partial store  */
> +	add	r11,r11,r9
> +	sub	r5,r5,r9
OK.

> +
> +/* This code pads the remainder of dest with NULL bytes.  */
> +L(zero_padding_loop):
> +	cmpldi	cr6,r5,16	/* Check if length was reached.  */
> +	ble	cr6,L(zero_padding_end)
> +
> +	stxv	v18,0(r11)
> +	addi	r11,r11,16
> +	addi	r5,r5,-16
> +
> +	b	L(zero_padding_loop)
> +
OK.



> +L(zero_padding_end):
> +	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
> +	stxvl	v18,r11,r10	/* Partial store  */
> +	blr

OK.

> +
> +L(n_tail):

Is this label used?

> +
> +END (FUNC_NAME)
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
> index 19acb6c64a..cd2b47b403 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
> +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile

OK.

> index ea10b00417..aa63e1c23f 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c

OK.

> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S
> new file mode 100644
> index 0000000000..b9b6092f7b
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S

OK.

> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
> index 7bacf28aca..822ceb2003 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
> @@ -28,11 +28,18 @@
>   extern __typeof (strncpy) __strncpy_ppc attribute_hidden;
>   extern __typeof (strncpy) __strncpy_power7 attribute_hidden;
>   extern __typeof (strncpy) __strncpy_power8 attribute_hidden;
> +# ifdef __LITTLE_ENDIAN__
> +extern __typeof (strncpy) __strncpy_power9 attribute_hidden;
> +# endif
>   # undef strncpy
> 
>   /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
>    ifunc symbol properly. */
>   libc_ifunc_redirected (__redirect_strncpy, strncpy,
> +# ifdef __LITTLE_ENDIAN__
> +		   (hwcap2 & PPC_FEATURE2_ARCH_3_00)
> +		   ? __strncpy_power9 :

Trivial nit, I think the above two lines need two extra spaces.
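
For anyone skimming, the resulting selection order is, schematically
(plain C rather than the ifunc macro; strncpy_t is a hypothetical
typedef for the function-pointer type, and the tail of the chain is
inferred from the existing code):

	typedef char *(*strncpy_t) (char *, const char *, size_t);

	static strncpy_t
	resolve_strncpy (unsigned long hwcap, unsigned long hwcap2)
	{
	  if (hwcap2 & PPC_FEATURE2_ARCH_3_00)	 /* POWER9 (LE only) */
	    return __strncpy_power9;
	  if (hwcap2 & PPC_FEATURE2_ARCH_2_07)	 /* POWER8 */
	    return __strncpy_power8;
	  if (hwcap & PPC_FEATURE_HAS_VSX)	 /* POWER7 */
	    return __strncpy_power7;
	  return __strncpy_ppc;
	}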
Paul A. Clarke Aug. 28, 2020, 7:12 p.m. UTC | #3
On Thu, Aug 20, 2020 at 03:29:16PM -0300, Raphael Moreira Zinsly via Libc-alpha wrote:
> Similar to the strcpy P9 optimization, this version uses VSX to improve
> performance.
> ---

> diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> new file mode 100644
> index 0000000000..cde68384d4
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> @@ -0,0 +1,276 @@
> +/* Optimized strncpy implementation for PowerPC64/POWER9.

sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S below, has
"POWER9/PPC64".  Can we make these consistent?  Can we just say
"POWER9"? Do we need to indicate little-endian only?

> +   Copyright (C) 2020 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +
> +# ifndef STRNCPY
> +#  define FUNC_NAME strncpy
> +# else
> +#  define FUNC_NAME STRNCPY
> +# endif
> +
> +/* Implements the function
> +
> +   char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
> +
> +   The implementation can load bytes past a null terminator, but only
> +   up to the next 16B boundary, so it never crosses a page.  */

nit, subjective: "up to the next 16-byte aligned address"
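
(The no-page-cross guarantee in that comment is just alignment
arithmetic; a small C sketch of the invariant, assuming 4 KiB pages:)

	#include <assert.h>
	#include <stdint.h>

	/* A 16B-aligned, 16B-wide load touches [addr, addr + 15]; since
	   16 divides the page size, both ends fall in the same page.  */
	void
	never_crosses_page (uintptr_t addr)
	{
	  assert (addr % 16 == 0);
	  assert (addr / 4096 == (addr + 15) / 4096);
	}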

> +
> +.machine power9
> +ENTRY_TOCLESS (FUNC_NAME, 4)
> +	CALL_MCOUNT 2
> +
> +	cmpwi   r5, 0

This should be "cmpdi".
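
(To illustrate: cmpwi compares only the low 32 bits of r5, so a size
whose low word is zero would be mistaken for zero.  A contrived C
sketch of the failure mode:)

	#include <stdint.h>

	int
	main (void)
	{
	  uint64_t n = 0x100000000ULL; /* strncpy (dst, src, 1ULL << 32) */
	  /* cmpwi r5,0 effectively tests the truncated low word...  */
	  uint32_t w = (uint32_t) n;   /* == 0, so beqlr returns early  */
	  /* ...while cmpdi tests the full doubleword, which is nonzero.  */
	  return w == 0 && n != 0;     /* 1: the two compares disagree  */
	}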

> +	beqlr
> +	/* NULL string optimisation  */

This comment would make more sense above the "cmpdi" suggested above.

> +	lbz	r0,0(r4)
> +	stb	r0,0(r3)
> +	addi	r11,r3,1
> +	addi	r5,r5,-1
> +	vspltisb v18,0		/* Zeroes in v18  */
> +	cmpwi	r0,0

This should be "cmpdi".

> +	beq	L(zero_padding_loop)
> +

Given the above "NULL string" comment, you could
put an "empty string optimization" comment here.

> +	cmpwi	r5,0

This should be "cmpdi".

> +	beqlr

The "addi r11,r3,1" and "vspltisb v18,0" above aren't needed until
a bit later, which penalizes the empty string case.  I think you
can move the empty string test up.  In some experiments it also seemed
to help to move the lbz and the dependent stb apart.  Something like this:
	/* NULL string optimisation  */
	cmpdi	r5,0
	beqlr

	lbz	r0,0(r4)
	/* empty/1-byte string optimisation  */
	cmpdi	r5,1
	stb	r0,0(r3)
	beqlr

	cmpdi	r0,0
	addi	r11,r3,1
	addi	r5,r5,-1
	vspltisb v18,0		/* Zeroes in v18  */
	beq	L(zero_padding_loop)

(But, I didn't see significant performance difference in
some light experimentation. It might be worth another look.)

> +
> +L(cont):

This label isn't used.

> +	addi	r4,r4,1
> +	neg	r7,r4
> +	rldicl	r9,r7,0,60	/* How many bytes to get source 16B aligned?  */
> +
> +	/* Get source 16B aligned  */
> +	lvx	v0,0,r4
> +	lvsr	v1,0,r4
> +	vperm	v0,v18,v0,v1
> +
> +	vcmpequb v6,v0,v18	/* 0xff if byte is NULL, 0x00 otherwise  */
> +	vctzlsbb r7,v6		/* Number of trailing zeroes  */
> +	addi	r8,r7,1	/* Add null terminator  */
> +
> +	/* r8 = bytes including null
> +	   r9 = bytes to get source 16B aligned
> +	   if r8 > r9
> +	      no null, copy r9 bytes
> +	   else
> +	      there is a null, copy r8 bytes and return.  */
> +	cmpd	r8,r9

This should probably be "cmpld".
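
(The operands here are unsigned byte counts, and a signed doubleword
compare misorders values with the high bit set.  A contrived C sketch
of why cmpld is safer:)

	#include <stdint.h>

	int
	main (void)
	{
	  uint64_t n  = UINT64_MAX;   /* e.g. strncpy (dst, src, -1) */
	  uint64_t r8 = 1;            /* bytes including the null    */
	  int le_unsigned = r8 <= n;                      /* 1, cmpld */
	  int le_signed = (int64_t) r8 <= (int64_t) n;    /* 0, cmpd  */
	  return le_unsigned != le_signed;                /* 1 */
	}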

> +	bgt	L(no_null)
> +
> +	cmpd	r8,r5		/* r8 <= n?  */

This should probably be "cmpld".

> +	ble	L(null)
> +	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
> +	stxvl	32+v0,r11,r10	/* Partial store  */

Do we still need this "32+v0" syntax? Is that due to a minimum supported
level of binutils which isn't VSX-aware?
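
(Related aside on the stxvl lines: as I read the ISA, stxvl takes the
byte count in the top byte of the length register, which is what the
sldi by 56 sets up.  Roughly, in C:)

	#include <stdint.h>
	#include <string.h>

	/* Sketch of "sldi r10,r5,56; stxvl vN,r11,r10": store the first
	   min (len, 16) bytes of the vector at dst.  */
	void
	stxvl_sketch (uint8_t *dst, const uint8_t vec[16], uint64_t r5)
	{
	  uint64_t r10 = r5 << 56;   /* length into the top 8 bits  */
	  uint64_t len = r10 >> 56;
	  memcpy (dst, vec, len > 16 ? 16 : len);
	}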

> +
> +	blr
> +
> +L(null):
> +	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
> +	stxvl	32+v0,r11,r10	/* Partial store  */
> +
> +	add	r11,r11,r8
> +	sub	r5,r5,r8
> +	b L(zero_padding_loop)
> +
> +L(no_null):
> +	cmpd	r9,r5		/* Check if length was reached.  */

This should probably be "cmpld".

> +	bge	L(n_tail1)
> +	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
> +	stxvl	32+v0,r11,r10	/* Partial store  */
> +
> +	add	r4,r4,r9
> +	add	r11,r11,r9
> +	sub	r5,r5,r9
> +
> +L(loop):
> +	cmpldi	cr6,r5,64	/* Check if length was reached.  */
> +	ble	cr6,L(final_loop)
> +
> +	lxv	32+v0,0(r4)
> +	vcmpequb. v6,v0,v18	/* Any zero bytes?  */
> +	bne	cr6,L(prep_tail1)
> +
> +	lxv	32+v1,16(r4)
> +	vcmpequb. v6,v1,v18	/* Any zero bytes?  */
> +	bne	cr6,L(prep_tail2)
> +
> +	lxv	32+v2,32(r4)
> +	vcmpequb. v6,v2,v18	/* Any zero bytes?  */
> +	bne	cr6,L(prep_tail3)
> +
> +	lxv	32+v3,48(r4)
> +	vcmpequb. v6,v3,v18	/* Any zero bytes?  */
> +	bne	cr6,L(prep_tail4)
> +
> +	stxv	32+v0,0(r11)
> +	stxv	32+v1,16(r11)
> +	stxv	32+v2,32(r11)
> +	stxv	32+v3,48(r11)
> +
> +	addi	r4,r4,64
> +	addi	r11,r11,64
> +	addi	r5,r5,-64
> +
> +	b	L(loop)
> +
> +L(final_loop):
> +	cmpldi	cr5,r5,16
> +	lxv	32+v0,0(r4)
> +	vcmpequb. v6,v0,v18	/* Any zero bytes?  */
> +	ble	cr5,L(prep_n_tail1)
> +	bne	cr6,L(count_tail1)
> +	addi	r5,r5,-16
> +
> +	cmpldi	cr5,r5,16
> +	lxv	32+v1,16(r4)
> +	vcmpequb. v6,v1,v18	/* Any zero bytes?  */
> +	ble	cr5,L(prep_n_tail2)
> +	bne	cr6,L(count_tail2)
> +	addi	r5,r5,-16
> +
> +	cmpldi	cr5,r5,16
> +	lxv	32+v2,32(r4)
> +	vcmpequb. v6,v2,v18	/* Any zero bytes?  */
> +	ble	cr5,L(prep_n_tail3)
> +	bne	cr6,L(count_tail3)
> +	addi	r5,r5,-16
> +
> +	lxv	32+v3,48(r4)
> +	vcmpequb. v6,v3,v18	/* Any zero bytes?  */
> +	beq	cr6,L(n_tail4)
> +
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +	cmpd	r8,r5		/* r8 < n?  */

This should probably be "cmpld".

> +	blt	L(tail4)
> +L(n_tail4):
> +	stxv	32+v0,0(r11)
> +	stxv	32+v1,16(r11)
> +	stxv	32+v2,32(r11)
> +	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
> +	addi	r11,r11,48	/* Offset */
> +	stxvl	32+v3,r11,r10	/* Partial store  */
> +	blr
> +
> +L(prep_n_tail1):
> +	beq	cr6,L(n_tail1)	/* Any zero bytes?  */
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +	cmpd	r8,r5		/* r8 < n?  */

This should probably be "cmpld".

> +	blt	L(tail1)
> +L(n_tail1):
> +	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
> +	stxvl	32+v0,r11,r10	/* Partial store  */
> +	blr
> +
> +L(prep_n_tail2):
> +	beq	cr6,L(n_tail2)	/* Any zero bytes?  */
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +	cmpd	r8,r5		/* r8 < n?  */

This should probably be "cmpld".

> +	blt	L(tail2)
> +L(n_tail2):
> +	stxv	32+v0,0(r11)
> +	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
> +	addi	r11,r11,16	/* offset */
> +	stxvl	32+v1,r11,r10	/* Partial store  */
> +	blr
> +
> +L(prep_n_tail3):
> +	beq	cr6,L(n_tail3)	/* Any zero bytes?  */
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +	cmpd	r8,r5		/* r8 < n?  */

This should probably be "cmpld".

> +	blt	L(tail3)
> +L(n_tail3):
> +	stxv	32+v0,0(r11)
> +	stxv	32+v1,16(r11)
> +	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
> +	addi	r11,r11,32	/* Offset */
> +	stxvl	32+v2,r11,r10	/* Partial store  */
> +	blr
> +
> +L(prep_tail1):
> +L(count_tail1):
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +L(tail1):
> +	addi	r9,r8,1	/* Add null terminator  */
> +	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
> +	stxvl	32+v0,r11,r10	/* Partial store  */
> +	add	r11,r11,r9
> +	sub	r5,r5,r9
> +	b L(zero_padding_loop)
> +
> +L(prep_tail2):
> +	addi	r5,r5,-16
> +L(count_tail2):
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +L(tail2):
> +	addi	r9,r8,1	/* Add null terminator  */
> +	stxv	32+v0,0(r11)
> +	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
> +	addi	r11,r11,16	/* offset */
> +	stxvl	32+v1,r11,r10	/* Partial store  */
> +	add	r11,r11,r9
> +	sub	r5,r5,r9
> +	b L(zero_padding_loop)
> +
> +L(prep_tail3):
> +	addi	r5,r5,-32
> +L(count_tail3):
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +L(tail3):
> +	addi	r9,r8,1	/* Add null terminator  */
> +	stxv	32+v0,0(r11)
> +	stxv	32+v1,16(r11)
> +	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
> +	addi	r11,r11,32	/* offset */
> +	stxvl	32+v2,r11,r10	/* Partial store  */
> +	add	r11,r11,r9
> +	sub	r5,r5,r9
> +	b L(zero_padding_loop)
> +
> +L(prep_tail4):
> +	addi	r5,r5,-48
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +L(tail4):
> +	addi	r9,r8,1	/* Add null terminator  */
> +	stxv	32+v0,0(r11)
> +	stxv	32+v1,16(r11)
> +	stxv	32+v2,32(r11)
> +	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
> +	addi	r11,r11,48	/* offset */
> +	stxvl	32+v3,r11,r10	/* Partial store  */
> +	add	r11,r11,r9
> +	sub	r5,r5,r9
> +
> +/* This code pads the remainder of dest with NULL bytes.  */
> +L(zero_padding_loop):
> +	cmpldi	cr6,r5,16	/* Check if length was reached.  */
> +	ble	cr6,L(zero_padding_end)
> +
> +	stxv	v18,0(r11)
> +	addi	r11,r11,16
> +	addi	r5,r5,-16
> +
> +	b	L(zero_padding_loop)
> +
> +L(zero_padding_end):
> +	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
> +	stxvl	v18,r11,r10	/* Partial store  */
> +	blr
> +
> +L(n_tail):
> +
> +END (FUNC_NAME)

PC
Tulio Magno Quites Machado Filho Sept. 2, 2020, 1:20 p.m. UTC | #4
Raphael Moreira Zinsly via Libc-alpha <libc-alpha@sourceware.org> writes:

> diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> new file mode 100644
> index 0000000000..cde68384d4
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> @@ -0,0 +1,276 @@
> ...
> +/* Implements the function
> +
> +   char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
> +
> +   The implementation can load bytes past a null terminator, but only
> +   up to the next 16B boundary, so it never crosses a page.  */
> +
> +.machine power9

I don't think Binutils 2.26 supports .machine power9.  Likewise for all P9
instructions.  However, current glibc is expected to work with Binutils 2.26
(ppc64le), i.e. builds with Binutils 2.26 should not fail.

So, we either need to change this code (e.g. similar to strcmp) or we need
to bump the Binutils requirements.
The last time the Binutils requirement was bumped was in 2017, so I think it's safe
to do this now.

Let me prepare a patch proposing this.
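
(A quick way to test a given assembler: a hypothetical probe along the
lines of what a configure check could do.  Building this C file with
the target toolchain fails if the assembler predates POWER9 support:)

	/* probe-power9.c: the file-scope asm is only assembled, never
	   executed, so it is enough that the assembler accepts it.  */
	asm (".machine power9\n\t"
	     ".text\n\t"
	     "vctzlsbb 3,0");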
Paul E Murphy Sept. 2, 2020, 2 p.m. UTC | #5
On 9/2/20 8:20 AM, Tulio Magno Quites Machado Filho wrote:
> Raphael Moreira Zinsly via Libc-alpha <libc-alpha@sourceware.org> writes:
> 
>> diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
>> new file mode 100644
>> index 0000000000..cde68384d4
>> --- /dev/null
>> +++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
>> @@ -0,0 +1,276 @@
>> ...
>> +/* Implements the function
>> +
>> +   char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
>> +
>> +   The implementation can load bytes past a null terminator, but only
>> +   up to the next 16B boundary, so it never crosses a page.  */
>> +
>> +.machine power9
> 
> I don't think Binutils 2.26 supports .machine power9.  Likewise for all P9
> instructions.  However, current glibc is expected to work with Binutils 2.26
> (ppc64le), i.e. builds with Binutils 2.26 should not fail.
> 
> So, we either need to change this code (e.g. similar to strcmp) or we need
> to bump the Binutils requirements.
> The last time Binutils requirements was bumped was in 2017, so I think it's safe
> to do this now.
> 
> Let me prepare a patch proposing this.

There are at least 5 uses of .machine power9 throughout glibc today.  I 
agree with bumping at least the ppc64le requirements to match.
diff mbox series

Patch

diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
new file mode 100644
index 0000000000..cde68384d4
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
@@ -0,0 +1,276 @@ 
+/* Optimized strncpy implementation for PowerPC64/POWER9.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+# ifndef STRNCPY
+#  define FUNC_NAME strncpy
+# else
+#  define FUNC_NAME STRNCPY
+# endif
+
+/* Implements the function
+
+   char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+
+   The implementation can load bytes past a null terminator, but only
+   up to the next 16B boundary, so it never crosses a page.  */
+
+.machine power9
+ENTRY_TOCLESS (FUNC_NAME, 4)
+	CALL_MCOUNT 2
+
+	cmpwi   r5, 0
+	beqlr
+	/* NULL string optimisation  */
+	lbz	r0,0(r4)
+	stb	r0,0(r3)
+	addi	r11,r3,1
+	addi	r5,r5,-1
+	vspltisb v18,0		/* Zeroes in v18  */
+	cmpwi	r0,0
+	beq	L(zero_padding_loop)
+
+	cmpwi	r5,0
+	beqlr
+
+L(cont):
+	addi	r4,r4,1
+	neg	r7,r4
+	rldicl	r9,r7,0,60	/* How many bytes to get source 16B aligned?  */
+
+	/* Get source 16B aligned  */
+	lvx	v0,0,r4
+	lvsr	v1,0,r4
+	vperm	v0,v18,v0,v1
+
+	vcmpequb v6,v0,v18	/* 0xff if byte is NULL, 0x00 otherwise  */
+	vctzlsbb r7,v6		/* Number of trailing zeroes  */
+	addi	r8,r7,1	/* Add null terminator  */
+
+	/* r8 = bytes including null
+	   r9 = bytes to get source 16B aligned
+	   if r8 > r9
+	      no null, copy r9 bytes
+	   else
+	      there is a null, copy r8 bytes and return.  */
+	cmpd	r8,r9
+	bgt	L(no_null)
+
+	cmpd	r8,r5		/* r8 <= n?  */
+	ble	L(null)
+	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
+	stxvl	32+v0,r11,r10	/* Partial store  */
+
+	blr
+
+L(null):
+	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
+	stxvl	32+v0,r11,r10	/* Partial store  */
+
+	add	r11,r11,r8
+	sub	r5,r5,r8
+	b L(zero_padding_loop)
+
+L(no_null):
+	cmpd	r9,r5		/* Check if length was reached.  */
+	bge	L(n_tail1)
+	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
+	stxvl	32+v0,r11,r10	/* Partial store  */
+
+	add	r4,r4,r9
+	add	r11,r11,r9
+	sub	r5,r5,r9
+
+L(loop):
+	cmpldi	cr6,r5,64	/* Check if length was reached.  */
+	ble	cr6,L(final_loop)
+
+	lxv	32+v0,0(r4)
+	vcmpequb. v6,v0,v18	/* Any zero bytes?  */
+	bne	cr6,L(prep_tail1)
+
+	lxv	32+v1,16(r4)
+	vcmpequb. v6,v1,v18	/* Any zero bytes?  */
+	bne	cr6,L(prep_tail2)
+
+	lxv	32+v2,32(r4)
+	vcmpequb. v6,v2,v18	/* Any zero bytes?  */
+	bne	cr6,L(prep_tail3)
+
+	lxv	32+v3,48(r4)
+	vcmpequb. v6,v3,v18	/* Any zero bytes?  */
+	bne	cr6,L(prep_tail4)
+
+	stxv	32+v0,0(r11)
+	stxv	32+v1,16(r11)
+	stxv	32+v2,32(r11)
+	stxv	32+v3,48(r11)
+
+	addi	r4,r4,64
+	addi	r11,r11,64
+	addi	r5,r5,-64
+
+	b	L(loop)
+
+L(final_loop):
+	cmpldi	cr5,r5,16
+	lxv	32+v0,0(r4)
+	vcmpequb. v6,v0,v18	/* Any zero bytes?  */
+	ble	cr5,L(prep_n_tail1)
+	bne	cr6,L(count_tail1)
+	addi	r5,r5,-16
+
+	cmpldi	cr5,r5,16
+	lxv	32+v1,16(r4)
+	vcmpequb. v6,v1,v18	/* Any zero bytes?  */
+	ble	cr5,L(prep_n_tail2)
+	bne	cr6,L(count_tail2)
+	addi	r5,r5,-16
+
+	cmpldi	cr5,r5,16
+	lxv	32+v2,32(r4)
+	vcmpequb. v6,v2,v18	/* Any zero bytes?  */
+	ble	cr5,L(prep_n_tail3)
+	bne	cr6,L(count_tail3)
+	addi	r5,r5,-16
+
+	lxv	32+v3,48(r4)
+	vcmpequb. v6,v3,v18	/* Any zero bytes?  */
+	beq	cr6,L(n_tail4)
+
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+	cmpd	r8,r5		/* r8 < n?  */
+	blt	L(tail4)
+L(n_tail4):
+	stxv	32+v0,0(r11)
+	stxv	32+v1,16(r11)
+	stxv	32+v2,32(r11)
+	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
+	addi	r11,r11,48	/* Offset */
+	stxvl	32+v3,r11,r10	/* Partial store  */
+	blr
+
+L(prep_n_tail1):
+	beq	cr6,L(n_tail1)	/* Any zero bytes?  */
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+	cmpd	r8,r5		/* r8 < n?  */
+	blt	L(tail1)
+L(n_tail1):
+	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
+	stxvl	32+v0,r11,r10	/* Partial store  */
+	blr
+
+L(prep_n_tail2):
+	beq	cr6,L(n_tail2)	/* Any zero bytes?  */
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+	cmpd	r8,r5		/* r8 < n?  */
+	blt	L(tail2)
+L(n_tail2):
+	stxv	32+v0,0(r11)
+	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
+	addi	r11,r11,16	/* offset */
+	stxvl	32+v1,r11,r10	/* Partial store  */
+	blr
+
+L(prep_n_tail3):
+	beq	cr6,L(n_tail3)	/* Any zero bytes?  */
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+	cmpd	r8,r5		/* r8 < n?  */
+	blt	L(tail3)
+L(n_tail3):
+	stxv	32+v0,0(r11)
+	stxv	32+v1,16(r11)
+	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
+	addi	r11,r11,32	/* Offset */
+	stxvl	32+v2,r11,r10	/* Partial store  */
+	blr
+
+L(prep_tail1):
+L(count_tail1):
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+L(tail1):
+	addi	r9,r8,1	/* Add null terminator  */
+	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
+	stxvl	32+v0,r11,r10	/* Partial store  */
+	add	r11,r11,r9
+	sub	r5,r5,r9
+	b L(zero_padding_loop)
+
+L(prep_tail2):
+	addi	r5,r5,-16
+L(count_tail2):
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+L(tail2):
+	addi	r9,r8,1	/* Add null terminator  */
+	stxv	32+v0,0(r11)
+	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
+	addi	r11,r11,16	/* offset */
+	stxvl	32+v1,r11,r10	/* Partial store  */
+	add	r11,r11,r9
+	sub	r5,r5,r9
+	b L(zero_padding_loop)
+
+L(prep_tail3):
+	addi	r5,r5,-32
+L(count_tail3):
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+L(tail3):
+	addi	r9,r8,1	/* Add null terminator  */
+	stxv	32+v0,0(r11)
+	stxv	32+v1,16(r11)
+	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
+	addi	r11,r11,32	/* offset */
+	stxvl	32+v2,r11,r10	/* Partial store  */
+	add	r11,r11,r9
+	sub	r5,r5,r9
+	b L(zero_padding_loop)
+
+L(prep_tail4):
+	addi	r5,r5,-48
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+L(tail4):
+	addi	r9,r8,1	/* Add null terminator  */
+	stxv	32+v0,0(r11)
+	stxv	32+v1,16(r11)
+	stxv	32+v2,32(r11)
+	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
+	addi	r11,r11,48	/* offset */
+	stxvl	32+v3,r11,r10	/* Partial store  */
+	add	r11,r11,r9
+	sub	r5,r5,r9
+
+/* This code pads the remainder of dest with NULL bytes.  */
+L(zero_padding_loop):
+	cmpldi	cr6,r5,16	/* Check if length was reached.  */
+	ble	cr6,L(zero_padding_end)
+
+	stxv	v18,0(r11)
+	addi	r11,r11,16
+	addi	r5,r5,-16
+
+	b	L(zero_padding_loop)
+
+L(zero_padding_end):
+	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
+	stxvl	v18,r11,r10	/* Partial store  */
+	blr
+
+L(n_tail):
+
+END (FUNC_NAME)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 19acb6c64a..cd2b47b403 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -33,7 +33,7 @@  sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
 
 ifneq (,$(filter %le,$(config-machine)))
 sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
-		   rawmemchr-power9 strlen-power9
+		   rawmemchr-power9 strlen-power9 strncpy-power9
 endif
 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
 CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index ea10b00417..aa63e1c23f 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -301,6 +301,11 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/powerpc/powerpc64/multiarch/strncpy.c.  */
   IFUNC_IMPL (i, name, strncpy,
+#ifdef __LITTLE_ENDIAN__
+	      IFUNC_IMPL_ADD (array, i, strncpy,
+			      hwcap2 & PPC_FEATURE2_ARCH_3_00,
+			      __strncpy_power9)
+#endif
 	      IFUNC_IMPL_ADD (array, i, strncpy,
 			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
 			      __strncpy_power8)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S
new file mode 100644
index 0000000000..b9b6092f7b
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S
@@ -0,0 +1,26 @@ 
+/* Optimized strncpy implementation for POWER9/PPC64.
+   Copyright (C) 2016-2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#if defined __LITTLE_ENDIAN__ && IS_IN (libc)
+#define STRNCPY __strncpy_power9
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/le/power9/strncpy.S>
+#endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
index 7bacf28aca..822ceb2003 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
@@ -28,11 +28,18 @@ 
 extern __typeof (strncpy) __strncpy_ppc attribute_hidden;
 extern __typeof (strncpy) __strncpy_power7 attribute_hidden;
 extern __typeof (strncpy) __strncpy_power8 attribute_hidden;
+# ifdef __LITTLE_ENDIAN__
+extern __typeof (strncpy) __strncpy_power9 attribute_hidden;
+# endif
 # undef strncpy
 
 /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
  ifunc symbol properly. */
 libc_ifunc_redirected (__redirect_strncpy, strncpy,
+# ifdef __LITTLE_ENDIAN__
+		   (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+		   ? __strncpy_power9 :
+# endif
 		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 		       ? __strncpy_power8
 		       : (hwcap & PPC_FEATURE_HAS_VSX)