diff mbox

powerpc: Optimized strcat for POWER8/PPC64

Message ID 549AAEC6.2090006@linux.vnet.ibm.com
State New
Headers show

Commit Message

Adhemerval Zanella Dec. 24, 2014, 12:17 p.m. UTC
Hi,

With the new optimized strcpy for POWER8 [1], this patch adds an optimized
strcat which uses it along with the default implementation at strings/.

I see good improvements over the POWER7 version on a POWER8 machine,
especially for unaligned cases (which the new strcpy aims to optimize).
Benchtest results are attached.

Tested on powerpc64 and powerpc64le.

[1] https://sourceware.org/ml/libc-alpha/2014-12/msg00878.html

--

 	* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]: Add
	strcat-power8 object.
	* sysdeps/powerpc/powerpc64/multiarch/strcat.c (strcat): Add
	__strcat_power8 implementation.
	* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
	(__libc_ifunc_impl_list): Add __strcat_power8 implementation.
	* sysdeps/powerpc/powerpc64/multiarch/strcat-power8.c: New file:
	optimized strcat for power8.

--
simple_strcat	__strcat_power8	__strcat_power7	__strcat_ppc
Length    0/   0, alignment  0/ 0:	2.53125	8.21875	8.46875	11.8281
Length    0/   0, alignment  0/ 0:	1.95312	7.73438	9.01562	11.8281
Length    0/   0, alignment  0/ 0:	1.9375	9.03125	8.6875	11.8594
Length    0/   0, alignment  0/ 0:	1.9375	8.5	8.5625	11.8438
Length    1/   1, alignment  0/ 0:	3.45312	8.15625	9.82812	8.1875
Length    1/   1, alignment  0/ 0:	2.95312	8.15625	9.73438	8.04688
Length    1/   1, alignment  0/ 1:	2.85938	7.79688	9.3125	8.10938
Length    1/   1, alignment  1/ 0:	2.8125	7.70312	10.625	8.07812
Length    2/   2, alignment  0/ 0:	4.29688	8.79688	10.9219	8.71875
Length    2/   2, alignment  0/ 0:	3.92188	8.82812	10.875	8.65625
Length    2/   2, alignment  0/ 2:	3.6875	8.23438	10.9375	12.1875
Length    2/   2, alignment  2/ 0:	3.79688	8.57812	11.75	8.625
Length    3/   3, alignment  0/ 0:	4.84375	8.8125	11.9844	9.45312
Length    3/   3, alignment  0/ 0:	4.5	8.25	12.2812	9.39062
Length    3/   3, alignment  0/ 3:	4.4375	8.125	10.1875	9.29688
Length    3/   3, alignment  3/ 0:	4.40625	7.76562	12.6094	9.375
Length    4/   4, alignment  0/ 0:	5.625	8.40625	12.6719	11.5469
Length    4/   4, alignment  0/ 0:	5.39062	7.8125	12.5156	11.4219
Length    4/   4, alignment  0/ 4:	5.32812	8.01562	12.5312	10.1406
Length    4/   4, alignment  4/ 0:	5.25	8.26562	13.1406	11.5781
Length    5/   5, alignment  0/ 0:	6.375	8.4375	12.3594	9.875
Length    5/   5, alignment  0/ 0:	6.26562	8.51562	12.3438	10
Length    5/   5, alignment  0/ 5:	6.07812	9.01562	12.9844	10.2969
Length    5/   5, alignment  5/ 0:	6.01562	8.39062	12.6875	9.875
Length    6/   6, alignment  0/ 0:	7.29688	8.65625	11.6406	10.3438
Length    6/   6, alignment  0/ 0:	6.82812	8.125	10.7188	10.4375
Length    6/   6, alignment  0/ 6:	6.92188	8.3125	12.7188	12.1094
Length    6/   6, alignment  6/ 0:	6.85938	7.98438	12.0312	10.4219
Length    7/   7, alignment  0/ 0:	8.15625	8.17188	11.5	10.6562
Length    7/   7, alignment  0/ 0:	7.67188	9.01562	11.125	10.75
Length    7/   7, alignment  0/ 7:	7.65625	7.65625	11.9219	11.0625
Length    7/   7, alignment  7/ 0:	7.5625	8.70312	11.3438	10.5469
Length    8/   8, alignment  0/ 0:	8.89062	9.03125	8.35938	11.9219
Length    8/   8, alignment  0/ 0:	8.67188	8.40625	8.42188	11.9219
Length    8/   8, alignment  0/ 0:	8.5625	8.39062	8.25	11.9219
Length    8/   8, alignment  0/ 0:	8.59375	8.29688	8.34375	12.0312
Length    9/   9, alignment  0/ 0:	10.0312	8.3125	11.0312	11.4844
Length    9/   9, alignment  0/ 0:	9.4375	8.39062	10.6094	11.6094
Length    9/   9, alignment  0/ 1:	9.42188	8.51562	11.0781	11.5312
Length    9/   9, alignment  1/ 0:	9.45312	8.15625	15.2188	11.6094
Length   10/  10, alignment  0/ 0:	10.8281	8.89062	10.9844	11.9375
Length   10/  10, alignment  0/ 0:	10.5781	8.5	11.1719	11.8594
Length   10/  10, alignment  0/ 2:	10.5156	8.17188	11	13.1406
Length   10/  10, alignment  2/ 0:	10.4844	8.39062	13.5469	11.7656
Length   11/  11, alignment  0/ 0:	16.7188	8.28125	11.5938	12.0938
Length   11/  11, alignment  0/ 0:	16.6406	8.26562	11.4531	12.1094
Length   11/  11, alignment  0/ 3:	16.5312	7.60938	11.9844	12.1875
Length   11/  11, alignment  3/ 0:	16.75	8.34375	13.0938	12.3438
Length   12/  12, alignment  0/ 0:	18.6719	8.3125	11.2969	12.9531
Length   12/  12, alignment  0/ 0:	18.2656	8.10938	11.125	12.9375
Length   12/  12, alignment  0/ 4:	18.2188	9.57812	13.625	10.875
Length   12/  12, alignment  4/ 0:	18.1875	8.51562	10.2969	13
Length   13/  13, alignment  0/ 0:	19.2812	8.46875	12.875	12.7969
Length   13/  13, alignment  0/ 0:	19.0938	8.23438	12.25	12.6875
Length   13/  13, alignment  0/ 5:	19.2188	9.4375	17.0781	13.9531
Length   13/  13, alignment  5/ 0:	18.9531	8.46875	11.9062	12.9844
Length   14/  14, alignment  0/ 0:	20.6875	8.48438	13.1875	13.25
Length   14/  14, alignment  0/ 0:	20.5	7.92188	12.75	13.3125
Length   14/  14, alignment  0/ 6:	20.3281	9.26562	14.9844	14.25
Length   14/  14, alignment  6/ 0:	20.9062	8.40625	11.125	13.2031
Length   15/  15, alignment  0/ 0:	21.7969	8.32812	13.4688	13.3125
Length   15/  15, alignment  0/ 0:	21.5312	8.65625	13.2344	13.4219
Length   15/  15, alignment  0/ 7:	21.6719	9.34375	13.5938	14.3125
Length   15/  15, alignment  7/ 0:	21.6562	8.01562	10.5625	13.5156
Length   16/  16, alignment  0/ 0:	22.9844	8.70312	11.1094	13.4375
Length   16/  16, alignment  7/ 2:	23.1562	9.92188	18.7344	14.8281
Length   16/   4, alignment  0/ 0:	10.9531	9.34375	10.625	13.9375
Length   16/   4, alignment  7/ 2:	10.5	9.51562	14.625	13.6719
Length   32/  32, alignment  0/ 0:	44.3594	9.54688	11	14.6406
Length   32/  32, alignment  6/ 4:	44.125	9.28125	16.3438	20.75
Length   32/   8, alignment  0/ 0:	22.4219	9.07812	9.65625	13.6562
Length   32/   8, alignment  6/ 4:	22.1562	9.54688	15.3594	19.7188
Length   64/  64, alignment  0/ 0:	80.1875	12.375	10.9219	16.2969
Length   64/  64, alignment  5/ 6:	79.8906	10.6094	18.9062	33.2656
Length   64/  16, alignment  0/ 0:	45.4531	10.0781	10.9844	15.5781
Length   64/  16, alignment  5/ 6:	45.9219	9.39062	18.0781	33.2812
Length  128/ 128, alignment  0/ 0:	149.188	15.5312	13.9531	21.3281
Length  128/ 128, alignment  4/ 0:	148.797	15.2812	22.8438	27.3906
Length  128/  32, alignment  0/ 0:	82.0312	12.7656	13.8906	19.6875
Length  128/  32, alignment  4/ 0:	81.75	12.2344	21.3438	26.2656
Length  256/ 256, alignment  0/ 0:	288.547	26.0938	25.5156	30.9062
Length  256/ 256, alignment  3/ 2:	287.625	25.6875	38.5	117.781
Length  256/  64, alignment  0/ 0:	154.344	21.0156	20.9688	25.8594
Length  256/  64, alignment  3/ 2:	153.969	20.8438	34.2188	109.375
Length  512/ 512, alignment  0/ 0:	566	45.4062	48.1562	54.2031
Length  512/ 512, alignment  2/ 4:	566.125	44.6719	62.7188	224.078
Length  512/ 128, alignment  0/ 0:	298.453	31.1094	33.9062	40.4375
Length  512/ 128, alignment  2/ 4:	299.234	30.875	47.9531	210.5
Length 1024/1024, alignment  0/ 0:	1123.06	75.2344	86.3125	94.7031
Length 1024/1024, alignment  1/ 6:	1125.52	74.6094	109.438	436.594
Length 1024/ 256, alignment  0/ 0:	587.828	50.6875	62.7344	70.3125
Length 1024/ 256, alignment  1/ 6:	587.422	50.0469	85.7656	411.891
Length   16/   1, alignment  1/ 2:	9.65625	9.60938	11.6875	13.5469
Length   16/   1, alignment  2/ 1:	9.28125	9.4375	14.25	13.5156
Length   16/  10, alignment  1/ 1:	13.2656	9.17188	11.375	13.875
Length   16/  10, alignment  1/ 1:	12.9062	8.9375	11.4062	13.8438
Length   32/   1, alignment  2/ 4:	19.5625	10.0312	13.4844	19.8906
Length   32/   1, alignment  4/ 2:	19.2969	9.60938	15.875	19.8438
Length   32/  10, alignment  2/ 2:	23.3281	8.34375	12.5156	19.8125
Length   32/  10, alignment  2/ 2:	22.9688	8.8125	12	19.6875
Length   64/   1, alignment  3/ 6:	32.4688	10.0938	17.7812	32.7656
Length   64/   1, alignment  6/ 3:	32.1094	9.48438	17.6875	32.4219
Length   64/  10, alignment  3/ 3:	36.0156	9.6875	19.2031	32.4375
Length   64/  10, alignment  3/ 3:	35.9844	9.42188	18.7656	32.3594
Length  128/   1, alignment  4/ 0:	57.0625	11.6719	23.8125	57.75
Length  128/   1, alignment  0/ 4:	57.1719	11.6562	20.5156	58.0156
Length  128/  10, alignment  4/ 4:	61.6562	11.5625	22.3438	57.9375
Length  128/  10, alignment  4/ 4:	62.0781	12.4688	22	58.0781
Length  256/   1, alignment  5/ 2:	106.906	19.2188	32.3281	108.594
Length  256/   1, alignment  2/ 5:	106.922	19.6562	30.0312	108.594
Length  256/  10, alignment  5/ 5:	112.891	18.5938	30.9219	108.828
Length  256/  10, alignment  5/ 5:	113.578	18.5469	30.9219	108.5
Length  512/   1, alignment  6/ 4:	207.109	26.75	49.25	208.562
Length  512/   1, alignment  4/ 6:	206.531	27.375	47.5312	208.703
Length  512/  10, alignment  6/ 6:	216.219	27.8594	47.0938	209
Length  512/  10, alignment  6/ 6:	215.688	28	46.6406	209.578
Length 1024/   1, alignment  7/ 6:	406.062	42.6719	62.2812	408.266
Length 1024/   1, alignment  6/ 7:	407.625	43.7344	79.0469	407.922
Length 1024/  10, alignment  7/ 7:	420.484	43.7969	82.6719	408.297
Length 1024/  10, alignment  7/ 7:	420.781	43.6719	81.9844	408.875

Comments

Ondřej Bílka Jan. 3, 2015, 10:10 a.m. UTC | #1
On Wed, Dec 24, 2014 at 10:17:10AM -0200, Adhemerval Zanella wrote:
> Hi,
> 
> With new optimized strcpy for POWER8 [1], this patch adds an optimized
> strcat which uses it along with default implementation at strings/.
> 
> I see good improvements over POWER7 version on POWER8 machine, specially
> for unaligned cases (where the new strcpy aims to optimize).  Benchtests
> result in attachments.
> 
> Tested on powerpc64 and powerpc64le.
> 
So it's just the generic strcat where you use ifunc to avoid the PLT in the
strcpy call. Seems ok.
diff mbox

Patch

diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index f170551..74b2daa 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -18,8 +18,8 @@  sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
 		   strrchr-power7 strrchr-ppc64 strncat-power7 strncat-ppc64 \
 		   strncpy-power7 strncpy-ppc64 \
 		   stpncpy-power7 stpncpy-ppc64 strcmp-power7 strcmp-ppc64 \
-		   strcat-power7 strcat-ppc64 memmove-power7 memmove-ppc64 \
-		   bcopy-ppc64
+		   strcat-power8 strcat-power7 strcat-ppc64 memmove-power7 \
+		   memmove-ppc64 bcopy-ppc64
 
 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
 CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 2a7e7f5..d5b2184 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -303,6 +303,9 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/powerpc/powerpc64/multiarch/strcat.c.  */
   IFUNC_IMPL (i, name, strcat,
 	      IFUNC_IMPL_ADD (array, i, strcat,
+			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
+			      __strcat_power8)
+	      IFUNC_IMPL_ADD (array, i, strcat,
 			      hwcap & PPC_FEATURE_HAS_VSX,
 			      __strcat_power7)
 	      IFUNC_IMPL_ADD (array, i, strcat, 1,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcat-power8.c b/sysdeps/powerpc/powerpc64/multiarch/strcat-power8.c
new file mode 100644
index 0000000..3dc0ef6
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcat-power8.c
@@ -0,0 +1,30 @@ 
+/* Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <string.h>
+
+#define STRCAT __strcat_power8
+
+#undef libc_hidden_def
+#define libc_hidden_def(name)
+
+extern typeof (strcpy) __strcpy_power8;
+extern typeof (strlen) __strlen_power7;
+
+#define strcpy __strcpy_power8
+#define strlen __strlen_power7
+#include <sysdeps/powerpc/strcat.c>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcat.c b/sysdeps/powerpc/powerpc64/multiarch/strcat.c
index ec21062..b6f58a3 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcat.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcat.c
@@ -23,9 +23,12 @@ 
 
 extern __typeof (strcat) __strcat_ppc attribute_hidden;
 extern __typeof (strcat) __strcat_power7 attribute_hidden;
+extern __typeof (strcat) __strcat_power8 attribute_hidden;
 
 libc_ifunc (strcat,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __strcat_power7
+            (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+            ? __strcat_power8 :
+              (hwcap & PPC_FEATURE_HAS_VSX)
+              ? __strcat_power7
             : __strcat_ppc);
 #endif