diff mbox series

PING [PATCH] x86: Update memcpy/memset inline strategies for -mtune=generic

Message ID YSJtKKbBGoDI4hOd@gmail.com
State New
Headers show
Series PING [PATCH] x86: Update memcpy/memset inline strategies for -mtune=generic | expand

Commit Message

H.J. Lu Aug. 22, 2021, 3:28 p.m. UTC
On Tue, Mar 23, 2021 at 09:19:38AM +0100, Richard Biener wrote:
> On Tue, Mar 23, 2021 at 3:41 AM Hongyu Wang <wwwhhhyyy333@gmail.com> wrote:
> >
> > > Hongyue, please collect code size differences on SPEC CPU 2017 and
> > > eembc.
> >
> > Here is code size difference for this patch
> 
> Thanks, nothing too bad although slightly larger impacts than envisioned.
> 

PING.

OK for master branch?

Thanks.

H.J.
 ---
Simplify memcpy and memset inline strategies to avoid branches for
-mtune=generic:

1. With MOVE_RATIO and CLEAR_RATIO == 17, GCC will use integer/vector
   load and store for up to 16 * 16 (256) bytes when the data size is
   fixed and known.
2. Inline only if data size is known to be <= 256.
   a. Use "rep movsb/stosb" with simple code sequence if the data size
      is a constant.
   b. Use loop if data size is not a constant.
3. Use memcpy/memset library function if data size is unknown or > 256.

With -mtune=generic -O2,

1. On Ice Lake processor,

Performance impacts on SPEC CPU 2017:

500.perlbench_r  0.51%
502.gcc_r        0.55%
505.mcf_r        0.38%
520.omnetpp_r   -0.74%
523.xalancbmk_r -0.35%
525.x264_r       2.99%
531.deepsjeng_r -0.17%
541.leela_r     -0.98%
548.exchange2_r  0.89%
557.xz_r         0.70%
Geomean          0.37%

503.bwaves_r     0.04%
507.cactuBSSN_r -0.01%
508.namd_r      -0.45%
510.parest_r    -0.09%
511.povray_r    -1.37%
519.lbm_r        0.00%
521.wrf_r       -2.56%
526.blender_r   -0.01%
527.cam4_r      -0.05%
538.imagick_r    0.36%
544.nab_r        0.08%
549.fotonik3d_r -0.06%
554.roms_r       0.05%
Geomean         -0.34%

Significant impacts on eembc benchmarks:

eembc/nnet_test      14.85%
eembc/mp2decoddata2  13.57%

2. On Cascade Lake processor,

Performance impacts on SPEC CPU 2017:

500.perlbench_r -0.02%
502.gcc_r        0.10%
505.mcf_r       -1.14%
520.omnetpp_r   -0.22%
523.xalancbmk_r  0.21%
525.x264_r       0.94%
531.deepsjeng_r -0.37%
541.leela_r     -0.46%
548.exchange2_r -0.40%
557.xz_r         0.60%
Geomean         -0.08%

503.bwaves_r    -0.50%
507.cactuBSSN_r  0.05%
508.namd_r      -0.02%
510.parest_r     0.09%
511.povray_r    -1.35%
519.lbm_r        0.00%
521.wrf_r       -0.03%
526.blender_r   -0.83%
527.cam4_r       1.23%
538.imagick_r    0.97%
544.nab_r       -0.02%
549.fotonik3d_r -0.12%
554.roms_r       0.55%
Geomean          0.00%

Significant impacts on eembc benchmarks:

eembc/nnet_test      9.90%
eembc/mp2decoddata2  16.42%
eembc/textv2data3   -4.86%
eembc/qos            12.90%

3. On Znver3 processor,

Performance impacts on SPEC CPU 2017:

500.perlbench_r -0.96%
502.gcc_r       -1.06%
505.mcf_r       -0.01%
520.omnetpp_r   -1.45%
523.xalancbmk_r  2.89%
525.x264_r       4.98%
531.deepsjeng_r  0.18%
541.leela_r     -1.54%
548.exchange2_r -1.25%
557.xz_r        -0.01%
Geomean          0.16%

503.bwaves_r     0.04%
507.cactuBSSN_r  0.85%
508.namd_r      -0.13%
510.parest_r     0.39%
511.povray_r     0.00%
519.lbm_r        0.00%
521.wrf_r        0.28%
526.blender_r   -0.10%
527.cam4_r      -0.58%
538.imagick_r    0.69%
544.nab_r       -0.04%
549.fotonik3d_r -0.04%
554.roms_r       0.40%
Geomean          0.15%

Significant impacts on eembc benchmarks:

eembc/aifftr01       13.95%
eembc/idctrn01       8.41%
eembc/nnet_test      30.25%
eembc/mp2decoddata2  5.05%
eembc/textv2data3    6.43%
eembc/qos           -5.79%

Code size differences are:

SPEC CPU 2017

                  difference      w patch      w/o patch
500.perlbench_r     0.051%        1622637      1621805
502.gcc_r           0.039%        6930877      6928141
505.mcf_r           0.098%        16413        16397
520.omnetpp_r       0.083%        1327757      1326653
523.xalancbmk_r     0.001%        3575709      3575677
525.x264_r         -0.067%        769095       769607
531.deepsjeng_r     0.071%        67629        67581
541.leela_r        -3.062%        127629       131661
548.exchange2_r    -0.338%        66141        66365
557.xz_r            0.946%        128061       126861
503.bwaves_r        0.534%        33117        32941
507.cactuBSSN_r     0.004%        2993645      2993517
508.namd_r          0.006%        851677       851629
510.parest_r        0.488%        6741277      6708557
511.povray_r       -0.021%        849290       849466
521.wrf_r           0.022%        29682154     29675530
526.blender_r       0.054%        7544057      7540009
527.cam4_r          0.043%        6102234      6099594
538.imagick_r      -0.015%        1625770      1626010
544.nab_r           0.155%        155453       155213
549.fotonik3d_r     0.000%        351757       351757
554.roms_r          0.041%        735837       735533

eembc

aifftr01            0.762%        14813        14701
aiifft01            0.556%        14477        14397
idctrn01            0.101%        15853        15837
cjpeg-rose7-preset  0.114%        56125        56061
nnet_test          -0.848%        35549        35853
aes                 0.125%        38493        38445
cjpegv2data         0.108%        59213        59149
djpegv2data         0.025%        63821        63805
huffde             -0.104%        30621        30653
mp2decoddata       -0.047%        68285        68317
mp2enf32data1       0.018%        86925        86909
mp2enf32data2       0.018%        89357        89341
mp2enf32data3       0.018%        88253        88237
mp3playerfixeddata  0.103%        46877        46829
ip_pktcheckb1m      0.191%        25213        25165
nat                 0.527%        45757        45517
ospfv2              0.196%        24573        24525
routelookup         0.189%        25389        25341
tcpbulk             0.155%        30925        30877
textv2data          0.055%        29101        29085

gcc/

	* config/i386/x86-tune-costs.h (generic_memcpy): Updated.
	(generic_memset): Likewise.
	(generic_cost): Change CLEAR_RATIO to 17.
	* config/i386/x86-tune.def (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB):
	Add m_GENERIC.

gcc/testsuite/

	* gcc.target/i386/memcpy-strategy-12.c: New test.
	* gcc.target/i386/memcpy-strategy-13.c: Likewise.
	* gcc.target/i386/memset-strategy-10.c: Likewise.
	* gcc.target/i386/memset-strategy-11.c: Likewise.
	* gcc.target/i386/shrink_wrap_1.c: Also pass
	-mmemset-strategy=rep_8byte:-1:align.
	* gcc.target/i386/sw-1.c: Also pass -mstringop-strategy=rep_byte.
---
 gcc/config/i386/x86-tune-costs.h              | 31 ++++++++++++-------
 gcc/config/i386/x86-tune.def                  |  2 +-
 .../gcc.target/i386/memcpy-strategy-12.c      |  9 ++++++
 .../gcc.target/i386/memcpy-strategy-13.c      | 11 +++++++
 .../gcc.target/i386/memset-strategy-10.c      | 11 +++++++
 .../gcc.target/i386/memset-strategy-11.c      |  9 ++++++
 gcc/testsuite/gcc.target/i386/shrink_wrap_1.c |  2 +-
 gcc/testsuite/gcc.target/i386/sw-1.c          |  2 +-
 8 files changed, 63 insertions(+), 14 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/memcpy-strategy-12.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memcpy-strategy-13.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-strategy-10.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-strategy-11.c

Comments

H.J. Lu Sept. 8, 2021, 3:01 a.m. UTC | #1
On Sun, Aug 22, 2021 at 8:28 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Tue, Mar 23, 2021 at 09:19:38AM +0100, Richard Biener wrote:
> > On Tue, Mar 23, 2021 at 3:41 AM Hongyu Wang <wwwhhhyyy333@gmail.com> wrote:
> > >
> > > > Hongyue, please collect code size differences on SPEC CPU 2017 and
> > > > eembc.
> > >
> > > Here is code size difference for this patch
> >
> > Thanks, nothing too bad although slightly larger impacts than envisioned.
> >
>
> PING.
>
> OK for master branch?
>
> Thanks.
>
> H.J.
>  ---
> Simplify memcpy and memset inline strategies to avoid branches for
> -mtune=generic:
>
> 1. With MOVE_RATIO and CLEAR_RATIO == 17, GCC will use integer/vector
>    load and store for up to 16 * 16 (256) bytes when the data size is
>    fixed and known.
> 2. Inline only if data size is known to be <= 256.
>    a. Use "rep movsb/stosb" with simple code sequence if the data size
>       is a constant.
>    b. Use loop if data size is not a constant.
> 3. Use memcpy/memset library function if data size is unknown or > 256.
>

PING:

https://gcc.gnu.org/pipermail/gcc-patches/2021-August/577889.html
H.J. Lu Sept. 13, 2021, 1:38 p.m. UTC | #2
On Tue, Sep 7, 2021 at 8:01 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Sun, Aug 22, 2021 at 8:28 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Tue, Mar 23, 2021 at 09:19:38AM +0100, Richard Biener wrote:
> > > On Tue, Mar 23, 2021 at 3:41 AM Hongyu Wang <wwwhhhyyy333@gmail.com> wrote:
> > > >
> > > > > Hongyue, please collect code size differences on SPEC CPU 2017 and
> > > > > eembc.
> > > >
> > > > Here is code size difference for this patch
> > >
> > > Thanks, nothing too bad although slightly larger impacts than envisioned.
> > >
> >
> > PING.
> >
> > OK for master branch?
> >
> > Thanks.
> >
> > H.J.
> >  ---
> > Simplify memcpy and memset inline strategies to avoid branches for
> > -mtune=generic:
> >
> > 1. With MOVE_RATIO and CLEAR_RATIO == 17, GCC will use integer/vector
> >    load and store for up to 16 * 16 (256) bytes when the data size is
> >    fixed and known.
> > 2. Inline only if data size is known to be <= 256.
> >    a. Use "rep movsb/stosb" with simple code sequence if the data size
> >       is a constant.
> >    b. Use loop if data size is not a constant.
> > 3. Use memcpy/memset library function if data size is unknown or > 256.
> >
>
> PING:
>
> https://gcc.gnu.org/pipermail/gcc-patches/2021-August/577889.html
>

PING.  This should fix:

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102294
H.J. Lu Sept. 20, 2021, 5:06 p.m. UTC | #3
On Mon, Sep 13, 2021 at 6:38 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Tue, Sep 7, 2021 at 8:01 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Sun, Aug 22, 2021 at 8:28 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Tue, Mar 23, 2021 at 09:19:38AM +0100, Richard Biener wrote:
> > > > On Tue, Mar 23, 2021 at 3:41 AM Hongyu Wang <wwwhhhyyy333@gmail.com> wrote:
> > > > >
> > > > > > Hongyue, please collect code size differences on SPEC CPU 2017 and
> > > > > > eembc.
> > > > >
> > > > > Here is code size difference for this patch
> > > >
> > > > Thanks, nothing too bad although slightly larger impacts than envisioned.
> > > >
> > >
> > > PING.
> > >
> > > OK for master branch?
> > >
> > > Thanks.
> > >
> > > H.J.
> > >  ---
> > > Simplify memcpy and memset inline strategies to avoid branches for
> > > -mtune=generic:
> > >
> > > 1. With MOVE_RATIO and CLEAR_RATIO == 17, GCC will use integer/vector
> > >    load and store for up to 16 * 16 (256) bytes when the data size is
> > >    fixed and known.
> > > 2. Inline only if data size is known to be <= 256.
> > >    a. Use "rep movsb/stosb" with simple code sequence if the data size
> > >       is a constant.
> > >    b. Use loop if data size is not a constant.
> > > 3. Use memcpy/memset library function if data size is unknown or > 256.
> > >
> >
> > PING:
> >
> > https://gcc.gnu.org/pipermail/gcc-patches/2021-August/577889.html
> >
>
> PING.  This should fix:
>
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102294
>

PING.
H.J. Lu Oct. 1, 2021, 3:24 p.m. UTC | #4
On Mon, Sep 20, 2021 at 10:06 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, Sep 13, 2021 at 6:38 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Tue, Sep 7, 2021 at 8:01 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Sun, Aug 22, 2021 at 8:28 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > > On Tue, Mar 23, 2021 at 09:19:38AM +0100, Richard Biener wrote:
> > > > > On Tue, Mar 23, 2021 at 3:41 AM Hongyu Wang <wwwhhhyyy333@gmail.com> wrote:
> > > > > >
> > > > > > > Hongyue, please collect code size differences on SPEC CPU 2017 and
> > > > > > > eembc.
> > > > > >
> > > > > > Here is code size difference for this patch
> > > > >
> > > > > Thanks, nothing too bad although slightly larger impacts than envisioned.
> > > > >
> > > >
> > > > PING.
> > > >
> > > > OK for master branch?
> > > >
> > > > Thanks.
> > > >
> > > > H.J.
> > > >  ---
> > > > Simplify memcpy and memset inline strategies to avoid branches for
> > > > -mtune=generic:
> > > >
> > > > 1. With MOVE_RATIO and CLEAR_RATIO == 17, GCC will use integer/vector
> > > >    load and store for up to 16 * 16 (256) bytes when the data size is
> > > >    fixed and known.
> > > > 2. Inline only if data size is known to be <= 256.
> > > >    a. Use "rep movsb/stosb" with simple code sequence if the data size
> > > >       is a constant.
> > > >    b. Use loop if data size is not a constant.
> > > > 3. Use memcpy/memset library function if data size is unknown or > 256.
> > > >
> > >
> > > PING:
> > >
> > > https://gcc.gnu.org/pipermail/gcc-patches/2021-August/577889.html
> > >
> >
> > PING.  This should fix:
> >
> > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102294
> >
>
> PING.
>

Any comments or objections to this patch?
diff mbox series

Patch

diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index ffe810f2bcb..30e7c3e4261 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -2844,19 +2844,28 @@  struct processor_costs intel_cost = {
   "16",					/* Func alignment.  */
 };
 
-/* Generic should produce code tuned for Core-i7 (and newer chips)
-   and btver1 (and newer chips).  */
+/* Generic should produce code tuned for Haswell (and newer chips)
+   and znver1 (and newer chips).  NB: rep_prefix_1_byte is used only
+   for known size.  */
 
 static stringop_algs generic_memcpy[2] = {
-  {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
-             {-1, libcall, false}}},
-  {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
-             {-1, libcall, false}}}};
+  {libcall,
+   {{256, rep_prefix_1_byte, true},
+    {256, loop, false},
+    {-1, libcall, false}}},
+  {libcall,
+   {{256, rep_prefix_1_byte, true},
+    {256, loop, false},
+    {-1, libcall, false}}}};
 static stringop_algs generic_memset[2] = {
-  {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
-             {-1, libcall, false}}},
-  {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
-             {-1, libcall, false}}}};
+  {libcall,
+   {{256, rep_prefix_1_byte, true},
+    {256, loop, false},
+    {-1, libcall, false}}},
+  {libcall,
+   {{256, rep_prefix_1_byte, true},
+    {256, loop, false},
+    {-1, libcall, false}}}};
 static const
 struct processor_costs generic_cost = {
   {
@@ -2913,7 +2922,7 @@  struct processor_costs generic_cost = {
   COSTS_N_INSNS (1),			/* cost of movzx */
   8,					/* "large" insn */
   17,					/* MOVE_RATIO */
-  6,					/* CLEAR_RATIO */
+  17,					/* CLEAR_RATIO */
   {6, 6, 6},				/* cost of loading integer registers
 					   in QImode, HImode and SImode.
 					   Relative to reg-reg move (2).  */
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 8f55da89c92..a9a023f33f5 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -273,7 +273,7 @@  DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", m_386 | m_P4_NOCONA)
    move/set sequences of bytes with known size.  */
 DEF_TUNE (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB,
 	  "prefer_known_rep_movsb_stosb",
-	  m_SKYLAKE | m_ALDERLAKE | m_CORE_AVX512)
+	  m_SKYLAKE | m_ALDERLAKE | m_CORE_AVX512 | m_GENERIC)
 
 /* X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES: Enable generation of
    compact prologues and epilogues by issuing a misaligned moves.  This
diff --git a/gcc/testsuite/gcc.target/i386/memcpy-strategy-12.c b/gcc/testsuite/gcc.target/i386/memcpy-strategy-12.c
new file mode 100644
index 00000000000..e9998b70ab2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memcpy-strategy-12.c
@@ -0,0 +1,9 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=generic -mno-sse" } */
+/* { dg-final { scan-assembler "rep movsb" } } */
+
+void
+foo (char *dest, char *src)
+{
+  __builtin_memcpy (dest, src, 249);
+}
diff --git a/gcc/testsuite/gcc.target/i386/memcpy-strategy-13.c b/gcc/testsuite/gcc.target/i386/memcpy-strategy-13.c
new file mode 100644
index 00000000000..109bd675a51
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memcpy-strategy-13.c
@@ -0,0 +1,11 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=generic -mno-avx" } */
+/* { dg-final { scan-assembler "jmp\tmemcpy" { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler "call\tmemcpy" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "rep movsb" } } */
+
+void
+foo (char *dest, char *src)
+{
+  __builtin_memcpy (dest, src, 257);
+}
diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-10.c b/gcc/testsuite/gcc.target/i386/memset-strategy-10.c
new file mode 100644
index 00000000000..685d6e5a5c2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-strategy-10.c
@@ -0,0 +1,11 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=generic -mno-avx" } */
+/* { dg-final { scan-assembler "jmp\tmemset" { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler "call\tmemset" { target ia32 } } } */
+/* { dg-final { scan-assembler-not "rep stosb" } } */
+
+void
+foo (char *dest)
+{
+  __builtin_memset (dest, 0, 257);
+}
diff --git a/gcc/testsuite/gcc.target/i386/memset-strategy-11.c b/gcc/testsuite/gcc.target/i386/memset-strategy-11.c
new file mode 100644
index 00000000000..61ee463a8cf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/memset-strategy-11.c
@@ -0,0 +1,9 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=generic -mno-sse" } */
+/* { dg-final { scan-assembler "rep stosb" } } */
+
+void
+foo (char *dest)
+{
+  __builtin_memset (dest, 0, 253);
+}
diff --git a/gcc/testsuite/gcc.target/i386/shrink_wrap_1.c b/gcc/testsuite/gcc.target/i386/shrink_wrap_1.c
index 94dadd6cdbd..44fe7d2836e 100644
--- a/gcc/testsuite/gcc.target/i386/shrink_wrap_1.c
+++ b/gcc/testsuite/gcc.target/i386/shrink_wrap_1.c
@@ -1,5 +1,5 @@ 
 /* { dg-do compile { target { ! ia32 } } } */
-/* { dg-options "-O2 -fdump-rtl-pro_and_epilogue" } */
+/* { dg-options "-O2 -mmemset-strategy=rep_8byte:-1:align -fdump-rtl-pro_and_epilogue" } */
 
 enum machine_mode
 {
diff --git a/gcc/testsuite/gcc.target/i386/sw-1.c b/gcc/testsuite/gcc.target/i386/sw-1.c
index a9c89fca4ec..234db0e67c2 100644
--- a/gcc/testsuite/gcc.target/i386/sw-1.c
+++ b/gcc/testsuite/gcc.target/i386/sw-1.c
@@ -1,5 +1,5 @@ 
 /* { dg-do compile } */
-/* { dg-options "-O2 -mtune=generic -fshrink-wrap -fdump-rtl-pro_and_epilogue" } */
+/* { dg-options "-O2 -mtune=generic -mstringop-strategy=rep_byte -fshrink-wrap -fdump-rtl-pro_and_epilogue" } */
 /* { dg-additional-options "-mno-avx" { target ia32 } } */
 /* { dg-skip-if "No shrink-wrapping preformed" { x86_64-*-mingw* } } */