diff mbox series

Update x86-tune-costs.h for znver2

Message ID 20190723092903.52hkmllrkaeea2v4@kam.mff.cuni.cz
State New
Headers show
Series Update x86-tune-costs.h for znver2 | expand

Commit Message

Jan Hubicka July 23, 2019, 9:29 a.m. UTC
Hi,
this patch updates the znver2 costs to match reality.  In particular, we
re-benchmarked the memcpy strategies and it looks like glibc now wins even
for relatively small blocks.
Moreover, I updated the costs of moves to reflect that znver2 has 256-bit
vector paths and faster multiplication.

Bootstrapped/regtested x86_64-linux, committed.

Honza

	* x86-tune-costs.h (znver2_memcpy, znver2_memset): Update.
	(znver2_cost): Update 256 bit SSE costs and multiplication.

Comments

Jan Hubicka July 30, 2019, 8:09 a.m. UTC | #1
> Hi,
> this patch updates znver2 costs to match reality.  In particular we
> re-benchmarked memcpy strategies and it looks that glibc now wins even
> for relatively small blocks. 
> Moreover I updated costs of moves to reflect that znver2 has 256 vector
> paths and faster multiplication.
> 
> Bootstrapped/regtested x86_64-linux, comitted.
> 
> Honza
> 
> 	* x86-tune-costs.h (znver2_memcpy): Update.
> 	(znver2_costs): Update 256 bit SSE costs and multiplication.

Hello,
I have now backported the patch to gcc 9 branch.

Honza
> Index: config/i386/x86-tune-costs.h
> ===================================================================
> --- config/i386/x86-tune-costs.h	(revision 273727)
> +++ config/i386/x86-tune-costs.h	(working copy)
> @@ -1279,12 +1279,12 @@ struct processor_costs znver1_cost = {
>  static stringop_algs znver2_memcpy[2] = {
>    {libcall, {{6, loop, false}, {14, unrolled_loop, false},
>  	     {-1, rep_prefix_4_byte, false}}},
> -  {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
> +  {libcall, {{16, loop, false}, {64, rep_prefix_4_byte, false},
>  	     {-1, libcall, false}}}};
>  static stringop_algs znver2_memset[2] = {
>    {libcall, {{8, loop, false}, {24, unrolled_loop, false},
>  	     {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
> -  {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
> +  {libcall, {{24, rep_prefix_4_byte, false}, {128, rep_prefix_8_byte, false},
>  	     {-1, libcall, false}}}};
>  
>  struct processor_costs znver2_cost = {
> @@ -1335,11 +1335,11 @@ struct processor_costs znver2_cost = {
>  					   in SImode and DImode.  */
>    {8, 8},				/* cost of storing MMX registers
>  					   in SImode and DImode.  */
> -  2, 3, 6,				/* cost of moving XMM,YMM,ZMM
> +  2, 2, 3,				/* cost of moving XMM,YMM,ZMM
>  					   register.  */
> -  {6, 6, 6, 10, 20},			/* cost of loading SSE registers
> +  {6, 6, 6, 6, 12},			/* cost of loading SSE registers
>  					   in 32,64,128,256 and 512-bit.  */
> -  {6, 6, 6, 10, 20},			/* cost of unaligned loads.  */
> +  {6, 6, 6, 6, 12},			/* cost of unaligned loads.  */
>    {8, 8, 8, 8, 16},			/* cost of storing SSE registers
>  					   in 32,64,128,256 and 512-bit.  */
>    {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
> @@ -1372,7 +1372,7 @@ struct processor_costs znver2_cost = {
>    COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
>    COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
>    COSTS_N_INSNS (3),			/* cost of MULSS instruction.  */
> -  COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
> +  COSTS_N_INSNS (3),			/* cost of MULSD instruction.  */
>    COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
>    COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
>    COSTS_N_INSNS (10),			/* cost of DIVSS instruction.  */
Richard Biener July 30, 2019, 8:42 a.m. UTC | #2
On Tue, Jul 30, 2019 at 10:09 AM Jan Hubicka <hubicka@ucw.cz> wrote:
>
> > Hi,
> > this patch updates znver2 costs to match reality.  In particular we
> > re-benchmarked memcpy strategies and it looks that glibc now wins even
> > for relatively small blocks.
> > Moreover I updated costs of moves to reflect that znver2 has 256 vector
> > paths and faster multiplication.
> >
> > Bootstrapped/regtested x86_64-linux, comitted.
> >
> > Honza
> >
> >       * x86-tune-costs.h (znver2_memcpy): Update.
> >       (znver2_costs): Update 256 bit SSE costs and multiplication.
>
> Hello,
> I have now backported the patch to gcc 9 branch.

Thanks - can you please update changes.html for it in the 9.2 section?

Richard.

> Honza
> > Index: config/i386/x86-tune-costs.h
> > ===================================================================
> > --- config/i386/x86-tune-costs.h      (revision 273727)
> > +++ config/i386/x86-tune-costs.h      (working copy)
> > @@ -1279,12 +1279,12 @@ struct processor_costs znver1_cost = {
> >  static stringop_algs znver2_memcpy[2] = {
> >    {libcall, {{6, loop, false}, {14, unrolled_loop, false},
> >            {-1, rep_prefix_4_byte, false}}},
> > -  {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
> > +  {libcall, {{16, loop, false}, {64, rep_prefix_4_byte, false},
> >            {-1, libcall, false}}}};
> >  static stringop_algs znver2_memset[2] = {
> >    {libcall, {{8, loop, false}, {24, unrolled_loop, false},
> >            {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
> > -  {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
> > +  {libcall, {{24, rep_prefix_4_byte, false}, {128, rep_prefix_8_byte, false},
> >            {-1, libcall, false}}}};
> >
> >  struct processor_costs znver2_cost = {
> > @@ -1335,11 +1335,11 @@ struct processor_costs znver2_cost = {
> >                                          in SImode and DImode.  */
> >    {8, 8},                            /* cost of storing MMX registers
> >                                          in SImode and DImode.  */
> > -  2, 3, 6,                           /* cost of moving XMM,YMM,ZMM
> > +  2, 2, 3,                           /* cost of moving XMM,YMM,ZMM
> >                                          register.  */
> > -  {6, 6, 6, 10, 20},                 /* cost of loading SSE registers
> > +  {6, 6, 6, 6, 12},                  /* cost of loading SSE registers
> >                                          in 32,64,128,256 and 512-bit.  */
> > -  {6, 6, 6, 10, 20},                 /* cost of unaligned loads.  */
> > +  {6, 6, 6, 6, 12},                  /* cost of unaligned loads.  */
> >    {8, 8, 8, 8, 16},                  /* cost of storing SSE registers
> >                                          in 32,64,128,256 and 512-bit.  */
> >    {8, 8, 8, 8, 16},                  /* cost of unaligned stores.  */
> > @@ -1372,7 +1372,7 @@ struct processor_costs znver2_cost = {
> >    COSTS_N_INSNS (1),                 /* cost of cheap SSE instruction.  */
> >    COSTS_N_INSNS (3),                 /* cost of ADDSS/SD SUBSS/SD insns.  */
> >    COSTS_N_INSNS (3),                 /* cost of MULSS instruction.  */
> > -  COSTS_N_INSNS (4),                 /* cost of MULSD instruction.  */
> > +  COSTS_N_INSNS (3),                 /* cost of MULSD instruction.  */
> >    COSTS_N_INSNS (5),                 /* cost of FMA SS instruction.  */
> >    COSTS_N_INSNS (5),                 /* cost of FMA SD instruction.  */
> >    COSTS_N_INSNS (10),                        /* cost of DIVSS instruction.  */
Jan Hubicka July 30, 2019, 9:33 a.m. UTC | #3
> On Tue, Jul 30, 2019 at 10:09 AM Jan Hubicka <hubicka@ucw.cz> wrote:
> >
> > > Hi,
> > > this patch updates znver2 costs to match reality.  In particular we
> > > re-benchmarked memcpy strategies and it looks that glibc now wins even
> > > for relatively small blocks.
> > > Moreover I updated costs of moves to reflect that znver2 has 256 vector
> > > paths and faster multiplication.
> > >
> > > Bootstrapped/regtested x86_64-linux, comitted.
> > >
> > > Honza
> > >
> > >       * x86-tune-costs.h (znver2_memcpy): Update.
> > >       (znver2_costs): Update 256 bit SSE costs and multiplication.
> >
> > Hello,
> > I have now backported the patch to gcc 9 branch.
> 
> Thanks - can you please update changes.html for it in the 9.2 section?

There seems to be no GCC 9.2 section yet.

Index: gcc-9/changes.html
===================================================================
RCS file: /cvs/gcc/wwwdocs/htdocs/gcc-9/changes.html,v
retrieving revision 1.72
diff -c -3 -p -r1.72 changes.html
*** gcc-9/changes.html	12 Jul 2019 15:55:50 -0000	1.72
--- gcc-9/changes.html	30 Jul 2019 09:32:17 -0000
*************** complete (that is, it is possible that s
*** 1095,1099 ****
--- 1095,1105 ----
  are not listed here).</p>
  
  <!-- .................................................................. -->
+ <h2 id="GCC9.2">GCC 9.2</h2>
+ <ul>
+   <li>IA-32/x86-64 backend tuning for <code>znver2</code> was improved based on benchmarks on real hardware.</li>
+ </ul>
+ 
+ <!-- .................................................................. -->
  </body>
  </html>
Richard Biener July 30, 2019, 9:53 a.m. UTC | #4
On Tue, Jul 30, 2019 at 11:33 AM Jan Hubicka <hubicka@ucw.cz> wrote:
>
> > On Tue, Jul 30, 2019 at 10:09 AM Jan Hubicka <hubicka@ucw.cz> wrote:
> > >
> > > > Hi,
> > > > this patch updates znver2 costs to match reality.  In particular we
> > > > re-benchmarked memcpy strategies and it looks that glibc now wins even
> > > > for relatively small blocks.
> > > > Moreover I updated costs of moves to reflect that znver2 has 256 vector
> > > > paths and faster multiplication.
> > > >
> > > > Bootstrapped/regtested x86_64-linux, comitted.
> > > >
> > > > Honza
> > > >
> > > >       * x86-tune-costs.h (znver2_memcpy): Update.
> > > >       (znver2_costs): Update 256 bit SSE costs and multiplication.
> > >
> > > Hello,
> > > I have now backported the patch to gcc 9 branch.
> >
> > Thanks - can you please update changes.html for it in the 9.2 section?
>
> There seems to be no GCC 9.2 section yet.

Yes.  Looks good to me btw.

Richard.

> Index: gcc-9/changes.html
> ===================================================================
> RCS file: /cvs/gcc/wwwdocs/htdocs/gcc-9/changes.html,v
> retrieving revision 1.72
> diff -c -3 -p -r1.72 changes.html
> *** gcc-9/changes.html  12 Jul 2019 15:55:50 -0000      1.72
> --- gcc-9/changes.html  30 Jul 2019 09:32:17 -0000
> *************** complete (that is, it is possible that s
> *** 1095,1099 ****
> --- 1095,1105 ----
>   are not listed here).</p>
>
>   <!-- .................................................................. -->
> + <h2 id="GCC9.2">GCC 9.2</h2>
> + <ul>
> +   <li>IA-32/x86-64 backend tuning for <code>znver2</code> was improved based on benchmarks on real hardware.</li>
> + </uL>
> +
> + <!-- .................................................................. -->
>   </body>
>   </html>
Gerald Pfeifer Aug. 18, 2019, 8:06 a.m. UTC | #5
On Tue, 30 Jul 2019, Jan Hubicka wrote:
>> Thanks - can you please update changes.html for it in the 9.2 section?
> There seems to be no GCC 9.2 section yet.

I see one now. 

On Tue, 30 Jul 2019, Richard Biener wrote:
> Yes.  Looks good to me btw.

Same here.  (I would have taken Richard's note as approval, though
as maintainer over that area you don't even need any.)

Just to remove any doubt, though: okay, thanks. :-)

Gerald
diff mbox series

Patch

Index: config/i386/x86-tune-costs.h
===================================================================
--- config/i386/x86-tune-costs.h	(revision 273727)
+++ config/i386/x86-tune-costs.h	(working copy)
@@ -1279,12 +1279,12 @@  struct processor_costs znver1_cost = {
 static stringop_algs znver2_memcpy[2] = {
   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
 	     {-1, rep_prefix_4_byte, false}}},
-  {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
+  {libcall, {{16, loop, false}, {64, rep_prefix_4_byte, false},
 	     {-1, libcall, false}}}};
 static stringop_algs znver2_memset[2] = {
   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
 	     {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
-  {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
+  {libcall, {{24, rep_prefix_4_byte, false}, {128, rep_prefix_8_byte, false},
 	     {-1, libcall, false}}}};
 
 struct processor_costs znver2_cost = {
@@ -1335,11 +1335,11 @@  struct processor_costs znver2_cost = {
 					   in SImode and DImode.  */
   {8, 8},				/* cost of storing MMX registers
 					   in SImode and DImode.  */
-  2, 3, 6,				/* cost of moving XMM,YMM,ZMM
+  2, 2, 3,				/* cost of moving XMM,YMM,ZMM
 					   register.  */
-  {6, 6, 6, 10, 20},			/* cost of loading SSE registers
+  {6, 6, 6, 6, 12},			/* cost of loading SSE registers
 					   in 32,64,128,256 and 512-bit.  */
-  {6, 6, 6, 10, 20},			/* cost of unaligned loads.  */
+  {6, 6, 6, 6, 12},			/* cost of unaligned loads.  */
   {8, 8, 8, 8, 16},			/* cost of storing SSE registers
 					   in 32,64,128,256 and 512-bit.  */
   {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
@@ -1372,7 +1372,7 @@  struct processor_costs znver2_cost = {
   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
   COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
   COSTS_N_INSNS (3),			/* cost of MULSS instruction.  */
-  COSTS_N_INSNS (4),			/* cost of MULSD instruction.  */
+  COSTS_N_INSNS (3),			/* cost of MULSD instruction.  */
   COSTS_N_INSNS (5),			/* cost of FMA SS instruction.  */
   COSTS_N_INSNS (5),			/* cost of FMA SD instruction.  */
   COSTS_N_INSNS (10),			/* cost of DIVSS instruction.  */