diff mbox series

Add scatter/gather costs

Message ID 20171025191909.GA89979@kam.mff.cuni.cz
State New
Headers show
Series Add scatter/gather costs | expand

Commit Message

Jan Hubicka Oct. 25, 2017, 7:19 p.m. UTC
Hi,
this patch adds computation of scatter/gather to i386 cost metric.
The costs for core are set for haswell, skylake has better implementation
so I will have to split the cost tables for cores older and younger than
skylake. I will do that as a followup.

Bootstrapped/regtested x86_64-linux, committed.

Honza

	* i386.c (ix86_builtin_vectorization_cost): Compute scatter/gather
	cost correctly.
	* i386.h (processor_costs): Add gather_static, gather_per_elt,
	scatter_static, scatter_per_elt.
	* x86-tune-costs.h: Add new cost entries.

Comments

Kumar, Venkataramanan Oct. 26, 2017, 6:48 a.m. UTC | #1
Hi Honza, 

> -----Original Message-----
> From: gcc-patches-owner@gcc.gnu.org [mailto:gcc-patches-
> owner@gcc.gnu.org] On Behalf Of Jan Hubicka
> Sent: Thursday, October 26, 2017 12:49 AM
> To: gcc-patches@gcc.gnu.org
> Subject: Add scatter/gather costs
> 
> Hi,
> this patch adds computation of scatter/gather to i386 cost metric.
> The costs for core are set for haswell, skylake has better implementation so I
> will have to split the cost tables for cores older and younger than skylake. I
> will do that as a followup.
> 
> Bootstrapped/regtested x86_64-linux, committed.
> 
> Honza
> 
> 	* i386.c (ix86_builtin_vectorization_cost): Compute scatter/gather
> 	cost correctly.
> 	* i386.h (processor_costs): Add gather_static, gather_per_elt,
> 	scatter_static, scatter_per_elt.
> 	* x86-tune-costs.h: Add new cost entries.
> Index: config/i386/i386.c
> ==========================================================
> =========
> --- config/i386/i386.c	(revision 254073)
> +++ config/i386/i386.c	(working copy)
> @@ -44490,7 +44490,6 @@ ix86_builtin_vectorization_cost (enum ve
>        /* We should have separate costs for unaligned loads and gather/scatter.
>  	 Do that incrementally.  */
>        case unaligned_load:
> -      case vector_gather_load:
>  	index = sse_store_index (mode);
>          return ix86_vec_cost (mode,
>  			      COSTS_N_INSNS
> @@ -44498,13 +44497,28 @@ ix86_builtin_vectorization_cost (enum ve
>  			      true);
> 
>        case unaligned_store:
> -      case vector_scatter_store:
>  	index = sse_store_index (mode);
>          return ix86_vec_cost (mode,
>  			      COSTS_N_INSNS
>  				 (ix86_cost->sse_unaligned_store[index]) / 2,
>  			      true);
> 
> +      case vector_gather_load:
> +        return ix86_vec_cost (mode,
> +			      COSTS_N_INSNS
> +				 (ix86_cost->gather_static
> +				  + ix86_cost->gather_per_elt
> +				    * TYPE_VECTOR_SUBPARTS (vectype)) / 2,
> +			      true);
> +
> +      case vector_scatter_store:
> +        return ix86_vec_cost (mode,
> +			      COSTS_N_INSNS
> +				 (ix86_cost->scatter_static
> +				  + ix86_cost->scatter_per_elt
> +				    * TYPE_VECTOR_SUBPARTS (vectype)) / 2,
> +			      true);
> +
>        case cond_branch_taken:
>          return ix86_cost->cond_taken_branch_cost;
> 
> Index: config/i386/i386.h
> ==========================================================
> =========
> --- config/i386/i386.h	(revision 254073)
> +++ config/i386/i386.h	(working copy)
> @@ -253,6 +253,10 @@ struct processor_costs {
>    const int mmxsse_to_integer;	/* cost of moving mmxsse register to
>  				   integer.  */
>    const int ssemmx_to_integer;  /* cost of moving integer to mmxsse
> register. */
> +  const int gather_static, gather_per_elt; /* Cost of gather load is computed
> +				   as static + per_item * nelts. */
> +  const int scatter_static, scatter_per_elt; /* Cost of gather store is
> +				   computed as static + per_item * nelts.  */
>    const int l1_cache_size;	/* size of l1 cache, in kilobytes.  */
>    const int l2_cache_size;	/* size of l2 cache, in kilobytes.  */
>    const int prefetch_block;	/* bytes moved to cache for prefetch.  */
> Index: config/i386/x86-tune-costs.h
> ==========================================================
> =========
> --- config/i386/x86-tune-costs.h	(revision 254073)
> +++ config/i386/x86-tune-costs.h	(working copy)
> @@ -82,6 +82,8 @@ struct processor_costs ix86_size_cost =
>    {3, 3, 3, 3, 3},				/* cost of unaligned SSE store
>  					   in 128bit, 256bit and 512bit */
>    3, 3,					/* SSE->integer and integer->SSE
> moves */
> +  5, 0,					/* Gather load static, per_elt.  */
> +  5, 0,					/* Gather store static, per_elt.  */
>    0,					/* size of l1 cache  */
>    0,					/* size of l2 cache  */
>    0,					/* size of prefetch block */
> @@ -166,6 +168,8 @@ struct processor_costs i386_cost = {	/*
>  					   in 32,64,128,256 and 512-bit */
>    {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
>    3, 3,					/* SSE->integer and integer->SSE
> moves */
> +  4, 4,					/* Gather load static, per_elt.  */
> +  4, 4,					/* Gather store static, per_elt.  */
>    0,					/* size of l1 cache  */
>    0,					/* size of l2 cache  */
>    0,					/* size of prefetch block */
> @@ -249,6 +253,8 @@ struct processor_costs i486_cost = {	/*
>  					   in 32,64,128,256 and 512-bit */
>    {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
>    3, 3,					/* SSE->integer and integer->SSE
> moves */
> +  4, 4,					/* Gather load static, per_elt.  */
> +  4, 4,					/* Gather store static, per_elt.  */
>    4,					/* size of l1 cache.  486 has 8kB cache
>  					   shared for code and data, so 4kB is
>  					   not really precise.  */
> @@ -334,6 +340,8 @@ struct processor_costs pentium_cost = {
>  					   in 32,64,128,256 and 512-bit */
>    {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
>    3, 3,					/* SSE->integer and integer->SSE
> moves */
> +  4, 4,					/* Gather load static, per_elt.  */
> +  4, 4,					/* Gather store static, per_elt.  */
>    8,					/* size of l1 cache.  */
>    8,					/* size of l2 cache  */
>    0,					/* size of prefetch block */
> @@ -410,6 +418,8 @@ struct processor_costs lakemont_cost = {
>  					   in 32,64,128,256 and 512-bit */
>    {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
>    3, 3,					/* SSE->integer and integer->SSE
> moves */
> +  4, 4,					/* Gather load static, per_elt.  */
> +  4, 4,					/* Gather store static, per_elt.  */
>    8,					/* size of l1 cache.  */
>    8,					/* size of l2 cache  */
>    0,					/* size of prefetch block */
> @@ -501,6 +511,8 @@ struct processor_costs pentiumpro_cost =
>  					   in 32,64,128,256 and 512-bit */
>    {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
>    3, 3,					/* SSE->integer and integer->SSE
> moves */
> +  4, 4,					/* Gather load static, per_elt.  */
> +  4, 4,					/* Gather store static, per_elt.  */
>    8,					/* size of l1 cache.  */
>    256,					/* size of l2 cache  */
>    32,					/* size of prefetch block */
> @@ -584,6 +596,8 @@ struct processor_costs geode_cost = {
>  					   in 32,64,128,256 and 512-bit */
>    {2, 2, 8, 16, 32},			/* cost of unaligned stores.  */
>    6, 6,					/* SSE->integer and integer->SSE
> moves */
> +  2, 2,					/* Gather load static, per_elt.  */
> +  2, 2,					/* Gather store static, per_elt.  */
>    64,					/* size of l1 cache.  */
>    128,					/* size of l2 cache.  */
>    32,					/* size of prefetch block */
> @@ -666,6 +680,8 @@ struct processor_costs k6_cost = {
>  					   in 32,64,128,256 and 512-bit */
>    {2, 2, 8, 16, 32},			/* cost of unaligned stores.  */
>    6, 6,					/* SSE->integer and integer->SSE
> moves */
> +  2, 2,					/* Gather load static, per_elt.  */
> +  2, 2,					/* Gather store static, per_elt.  */
>    32,					/* size of l1 cache.  */
>    32,					/* size of l2 cache.  Some models
>  					   have integrated l2 cache, but
> @@ -754,6 +770,8 @@ struct processor_costs athlon_cost = {
>  					   in 32,64,128,256 and 512-bit */
>    {4, 4, 5, 10, 20},			/* cost of unaligned stores.  */
>    5, 5,					/* SSE->integer and integer->SSE
> moves */
> +  4, 4,					/* Gather load static, per_elt.  */
> +  4, 4,					/* Gather store static, per_elt.  */
>    64,					/* size of l1 cache.  */
>    256,					/* size of l2 cache.  */
>    64,					/* size of prefetch block */
> @@ -844,6 +862,8 @@ struct processor_costs k8_cost = {
>  					   in 32,64,128,256 and 512-bit */
>    {4, 4, 5, 10, 20},			/* cost of unaligned stores.  */
>    5, 5,					/* SSE->integer and integer->SSE
> moves */
> +  4, 4,					/* Gather load static, per_elt.  */
> +  4, 4,					/* Gather store static, per_elt.  */
>    64,					/* size of l1 cache.  */
>    512,					/* size of l2 cache.  */
>    64,					/* size of prefetch block */
> @@ -946,6 +966,8 @@ struct processor_costs amdfam10_cost = {
>  							       1/1  1/1
>  					    MOVD reg32, xmmreg Double
> FADD 3
>  							       1/1  1/1 */
> +  4, 4,					/* Gather load static, per_elt.  */
> +  4, 4,					/* Gather store static, per_elt.  */
>    64,					/* size of l1 cache.  */
>    512,					/* size of l2 cache.  */
>    64,					/* size of prefetch block */
> @@ -1041,6 +1063,8 @@ const struct processor_costs bdver1_cost
>  					   in 32,64,128,256 and 512-bit */
>    {10, 10, 10, 20, 30},			/* cost of unaligned stores.  */
>    16, 20,				/* SSE->integer and integer->SSE
> moves */
> +  12, 12,				/* Gather load static, per_elt.  */
> +  10, 10,				/* Gather store static, per_elt.  */
>    16,					/* size of l1 cache.  */
>    2048,					/* size of l2 cache.  */
>    64,					/* size of prefetch block */
> @@ -1138,6 +1162,8 @@ const struct processor_costs bdver2_cost
>  					   in 32,64,128,256 and 512-bit */
>    {10, 10, 10, 20, 30},			/* cost of unaligned stores.  */
>    16, 20,				/* SSE->integer and integer->SSE
> moves */
> +  12, 12,				/* Gather load static, per_elt.  */
> +  10, 10,				/* Gather store static, per_elt.  */
>    16,					/* size of l1 cache.  */
>    2048,					/* size of l2 cache.  */
>    64,					/* size of prefetch block */
> @@ -1234,6 +1260,8 @@ struct processor_costs bdver3_cost = {
>  					   in 32,64,128,256 and 512-bit */
>    {10, 10, 10, 20, 30},			/* cost of unaligned stores.  */
>    16, 20,				/* SSE->integer and integer->SSE
> moves */
> +  12, 12,				/* Gather load static, per_elt.  */
> +  10, 10,				/* Gather store static, per_elt.  */
>    16,					/* size of l1 cache.  */
>    2048,					/* size of l2 cache.  */
>    64,					/* size of prefetch block */
> @@ -1329,6 +1357,8 @@ struct processor_costs bdver4_cost = {
>  					   in 32,64,128,256 and 512-bit */
>    {10, 10, 10, 20, 30},			/* cost of unaligned stores.  */
>    16, 20,				/* SSE->integer and integer->SSE
> moves */
> +  12, 12,				/* Gather load static, per_elt.  */
> +  10, 10,				/* Gather store static, per_elt.  */
>    16,					/* size of l1 cache.  */
>    2048,					/* size of l2 cache.  */
>    64,					/* size of prefetch block */
> @@ -1435,6 +1465,11 @@ struct processor_costs znver1_cost = {
>  					   in 32,64,128,256 and 512-bit.  */
>    {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
>    6, 6,					/* SSE->integer and integer->SSE
> moves.  */
> +  /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPS is 35 uops,
> +     throughput 12.  Approx 9 uops do not depend on vector size and every
> load
> +     is 7 uops.  */
> +  18, 8,				/* Gather load static, per_elt.  */
> +  18, 10,				/* Gather store static, per_elt.  */

Can you please help on how you arrived at 18 for the load/store static cost (based on throughput)?
Per_elt is 8  i.e. (latency of load ) 4 * 2 (reg-reg move ) ?
 

>    32,					/* size of l1 cache.  */
>    512,					/* size of l2 cache.  */
>    64,					/* size of prefetch block.  */
> @@ -1539,6 +1574,8 @@ const struct processor_costs btver1_cost
>  					   in 32,64,128,256 and 512-bit */
>    {10, 10, 12, 24, 48},			/* cost of unaligned stores.  */
>    14, 14,				/* SSE->integer and integer->SSE
> moves */
> +  10, 10,				/* Gather load static, per_elt.  */
> +  10, 10,				/* Gather store static, per_elt.  */
>    32,					/* size of l1 cache.  */
>    512,					/* size of l2 cache.  */
>    64,					/* size of prefetch block */
> @@ -1624,6 +1661,8 @@ const struct processor_costs btver2_cost
>  					   in 32,64,128,256 and 512-bit */
>    {10, 10, 12, 24, 48},			/* cost of unaligned stores.  */
>    14, 14,				/* SSE->integer and integer->SSE
> moves */
> +  10, 10,				/* Gather load static, per_elt.  */
> +  10, 10,				/* Gather store static, per_elt.  */
>    32,					/* size of l1 cache.  */
>    2048,					/* size of l2 cache.  */
>    64,					/* size of prefetch block */
> @@ -1708,6 +1747,8 @@ struct processor_costs pentium4_cost = {
>  					   in 32,64,128,256 and 512-bit */
>    {32, 32, 32, 64, 128},		/* cost of unaligned stores.  */
>    20, 12,				/* SSE->integer and integer->SSE
> moves */
> +  16, 16,				/* Gather load static, per_elt.  */
> +  16, 16,				/* Gather store static, per_elt.  */
>    8,					/* size of l1 cache.  */
>    256,					/* size of l2 cache.  */
>    64,					/* size of prefetch block */
> @@ -1795,6 +1836,8 @@ struct processor_costs nocona_cost = {
>  					   in 32,64,128,256 and 512-bit */
>    {24, 24, 24, 48, 96},			/* cost of unaligned stores.  */
>    20, 12,				/* SSE->integer and integer->SSE
> moves */
> +  12, 12,				/* Gather load static, per_elt.  */
> +  12, 12,				/* Gather store static, per_elt.  */
>    8,					/* size of l1 cache.  */
>    1024,					/* size of l2 cache.  */
>    64,					/* size of prefetch block */
> @@ -1880,6 +1923,8 @@ struct processor_costs atom_cost = {
>  					   in 32,64,128,256 and 512-bit */
>    {16, 16, 16, 32, 64},			/* cost of unaligned stores.  */
>    8, 6,					/* SSE->integer and integer->SSE
> moves */
> +  8, 8,					/* Gather load static, per_elt.  */
> +  8, 8,					/* Gather store static, per_elt.  */
>    32,					/* size of l1 cache.  */
>    256,					/* size of l2 cache.  */
>    64,					/* size of prefetch block */
> @@ -1965,6 +2010,8 @@ struct processor_costs slm_cost = {
>  					   in 32,64,128,256 and 512-bit */
>    {16, 16, 16, 32, 64},			/* cost of unaligned stores.  */
>    8, 6,					/* SSE->integer and integer->SSE
> moves */
> +  8, 8,					/* Gather load static, per_elt.  */
> +  8, 8,					/* Gather store static, per_elt.  */
>    32,					/* size of l1 cache.  */
>    256,					/* size of l2 cache.  */
>    64,					/* size of prefetch block */
> @@ -2050,6 +2097,8 @@ struct processor_costs intel_cost = {
>  					   in 32,64,128,256 and 512-bit */
>    {10, 10, 10, 10, 10},			/* cost of unaligned loads.  */
>    4, 4,					/* SSE->integer and integer->SSE
> moves */
> +  6, 6,					/* Gather load static, per_elt.  */
> +  6, 6,					/* Gather store static, per_elt.  */
>    32,					/* size of l1 cache.  */
>    256,					/* size of l2 cache.  */
>    64,					/* size of prefetch block */
> @@ -2142,6 +2191,8 @@ struct processor_costs generic_cost = {
>  					   in 32,64,128,256 and 512-bit */
>    {10, 10, 10, 15, 20},			/* cost of unaligned storess.  */
>    20, 20,				/* SSE->integer and integer->SSE
> moves */
> +  6, 6,					/* Gather load static, per_elt.  */
> +  6, 6,					/* Gather store static, per_elt.  */
>    32,					/* size of l1 cache.  */
>    512,					/* size of l2 cache.  */
>    64,					/* size of prefetch block */
> @@ -2239,6 +2290,11 @@ struct processor_costs core_cost = {
>  					   in 32,64,128,256 and 512-bit */
>    {6, 6, 6, 6, 12},			/* cost of unaligned stores.  */
>    2, 2,					/* SSE->integer and integer->SSE
> moves */
> +  /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPS is 9 uops,
> +     rec. throughput 6.
> +     So 5 uops statically and one uop per load.  */
> +  10, 6,				/* Gather load static, per_elt.  */
> +  10, 6,				/* Gather store static, per_elt.  */
>    64,					/* size of l1 cache.  */
>    512,					/* size of l2 cache.  */
>    64,					/* size of prefetch block */

Regards,
Venkat.
Jan Hubicka Oct. 26, 2017, 7:34 a.m. UTC | #2
> Hi Honza, 
> 
> > +  /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
> > +     throughput 12.  Approx 9 uops do not depend on vector size and every
> > load
> > +     is 7 uops.  */
> > +  18, 8,				/* Gather load static, per_elt.  */
> > +  18, 10,				/* Gather store static, per_elt.  */
> 
> Can you please help on how you arrived at 18 for the load/store static cost (based on throughput)?
> Per_elt is 8  i.e. (latency of load ) 4 * 2 (reg-reg move ) ?

From the number of uops it seemed that gather is roughly 9+7*n where n is number of
entries. reg-reg move is 2, so 18 is 9*2.  I think we need to account that CPU
is indeed doing n independent load operations (so it does not save anything compared
to scalar code) and bit more.  Load cost is set to 6 (perhaps it should be 8 for
integer and more for FP?). So I went for 8 to make it bit more expensive.

I plan to experiment with the values incrementally so any suggestions are welcome.
Honza
>  
> 
> >    32,					/* size of l1 cache.  */
> >    512,					/* size of l2 cache.  */
> >    64,					/* size of prefetch block.  */
> > @@ -1539,6 +1574,8 @@ const struct processor_costs btver1_cost
> >  					   in 32,64,128,256 and 512-bit */
> >    {10, 10, 12, 24, 48},			/* cost of unaligned stores.  */
> >    14, 14,				/* SSE->integer and integer->SSE
> > moves */
> > +  10, 10,				/* Gather load static, per_elt.  */
> > +  10, 10,				/* Gather store static, per_elt.  */
> >    32,					/* size of l1 cache.  */
> >    512,					/* size of l2 cache.  */
> >    64,					/* size of prefetch block */
> > @@ -1624,6 +1661,8 @@ const struct processor_costs btver2_cost
> >  					   in 32,64,128,256 and 512-bit */
> >    {10, 10, 12, 24, 48},			/* cost of unaligned stores.  */
> >    14, 14,				/* SSE->integer and integer->SSE
> > moves */
> > +  10, 10,				/* Gather load static, per_elt.  */
> > +  10, 10,				/* Gather store static, per_elt.  */
> >    32,					/* size of l1 cache.  */
> >    2048,					/* size of l2 cache.  */
> >    64,					/* size of prefetch block */
> > @@ -1708,6 +1747,8 @@ struct processor_costs pentium4_cost = {
> >  					   in 32,64,128,256 and 512-bit */
> >    {32, 32, 32, 64, 128},		/* cost of unaligned stores.  */
> >    20, 12,				/* SSE->integer and integer->SSE
> > moves */
> > +  16, 16,				/* Gather load static, per_elt.  */
> > +  16, 16,				/* Gather store static, per_elt.  */
> >    8,					/* size of l1 cache.  */
> >    256,					/* size of l2 cache.  */
> >    64,					/* size of prefetch block */
> > @@ -1795,6 +1836,8 @@ struct processor_costs nocona_cost = {
> >  					   in 32,64,128,256 and 512-bit */
> >    {24, 24, 24, 48, 96},			/* cost of unaligned stores.  */
> >    20, 12,				/* SSE->integer and integer->SSE
> > moves */
> > +  12, 12,				/* Gather load static, per_elt.  */
> > +  12, 12,				/* Gather store static, per_elt.  */
> >    8,					/* size of l1 cache.  */
> >    1024,					/* size of l2 cache.  */
> >    64,					/* size of prefetch block */
> > @@ -1880,6 +1923,8 @@ struct processor_costs atom_cost = {
> >  					   in 32,64,128,256 and 512-bit */
> >    {16, 16, 16, 32, 64},			/* cost of unaligned stores.  */
> >    8, 6,					/* SSE->integer and integer->SSE
> > moves */
> > +  8, 8,					/* Gather load static, per_elt.  */
> > +  8, 8,					/* Gather store static, per_elt.  */
> >    32,					/* size of l1 cache.  */
> >    256,					/* size of l2 cache.  */
> >    64,					/* size of prefetch block */
> > @@ -1965,6 +2010,8 @@ struct processor_costs slm_cost = {
> >  					   in 32,64,128,256 and 512-bit */
> >    {16, 16, 16, 32, 64},			/* cost of unaligned stores.  */
> >    8, 6,					/* SSE->integer and integer->SSE
> > moves */
> > +  8, 8,					/* Gather load static, per_elt.  */
> > +  8, 8,					/* Gather store static, per_elt.  */
> >    32,					/* size of l1 cache.  */
> >    256,					/* size of l2 cache.  */
> >    64,					/* size of prefetch block */
> > @@ -2050,6 +2097,8 @@ struct processor_costs intel_cost = {
> >  					   in 32,64,128,256 and 512-bit */
> >    {10, 10, 10, 10, 10},			/* cost of unaligned loads.  */
> >    4, 4,					/* SSE->integer and integer->SSE
> > moves */
> > +  6, 6,					/* Gather load static, per_elt.  */
> > +  6, 6,					/* Gather store static, per_elt.  */
> >    32,					/* size of l1 cache.  */
> >    256,					/* size of l2 cache.  */
> >    64,					/* size of prefetch block */
> > @@ -2142,6 +2191,8 @@ struct processor_costs generic_cost = {
> >  					   in 32,64,128,256 and 512-bit */
> >    {10, 10, 10, 15, 20},			/* cost of unaligned storess.  */
> >    20, 20,				/* SSE->integer and integer->SSE
> > moves */
> > +  6, 6,					/* Gather load static, per_elt.  */
> > +  6, 6,					/* Gather store static, per_elt.  */
> >    32,					/* size of l1 cache.  */
> >    512,					/* size of l2 cache.  */
> >    64,					/* size of prefetch block */
> > @@ -2239,6 +2290,11 @@ struct processor_costs core_cost = {
> >  					   in 32,64,128,256 and 512-bit */
> >    {6, 6, 6, 6, 12},			/* cost of unaligned stores.  */
> >    2, 2,					/* SSE->integer and integer->SSE
> > moves */
> > +  /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
> > +     rec. throughput 6.
> > +     So 5 uops statically and one uops per load.  */
> > +  10, 6,				/* Gather load static, per_elt.  */
> > +  10, 6,				/* Gather store static, per_elt.  */
> >    64,					/* size of l1 cache.  */
> >    512,					/* size of l2 cache.  */
> >    64,					/* size of prefetch block */
> 
> Regards,
> Venkat.
diff mbox series

Patch

Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(revision 254073)
+++ config/i386/i386.c	(working copy)
@@ -44490,7 +44490,6 @@  ix86_builtin_vectorization_cost (enum ve
       /* We should have separate costs for unaligned loads and gather/scatter.
 	 Do that incrementally.  */
       case unaligned_load:
-      case vector_gather_load:
 	index = sse_store_index (mode);
         return ix86_vec_cost (mode,
 			      COSTS_N_INSNS
@@ -44498,13 +44497,28 @@  ix86_builtin_vectorization_cost (enum ve
 			      true);
 
       case unaligned_store:
-      case vector_scatter_store:
 	index = sse_store_index (mode);
         return ix86_vec_cost (mode,
 			      COSTS_N_INSNS
 				 (ix86_cost->sse_unaligned_store[index]) / 2,
 			      true);
 
+      case vector_gather_load:
+        return ix86_vec_cost (mode,
+			      COSTS_N_INSNS
+				 (ix86_cost->gather_static
+				  + ix86_cost->gather_per_elt
+				    * TYPE_VECTOR_SUBPARTS (vectype)) / 2,
+			      true);
+
+      case vector_scatter_store:
+        return ix86_vec_cost (mode,
+			      COSTS_N_INSNS
+				 (ix86_cost->scatter_static
+				  + ix86_cost->scatter_per_elt
+				    * TYPE_VECTOR_SUBPARTS (vectype)) / 2,
+			      true);
+
       case cond_branch_taken:
         return ix86_cost->cond_taken_branch_cost;
 
Index: config/i386/i386.h
===================================================================
--- config/i386/i386.h	(revision 254073)
+++ config/i386/i386.h	(working copy)
@@ -253,6 +253,10 @@  struct processor_costs {
   const int mmxsse_to_integer;	/* cost of moving mmxsse register to
 				   integer.  */
   const int ssemmx_to_integer;  /* cost of moving integer to mmxsse register. */
+  const int gather_static, gather_per_elt; /* Cost of gather load is computed
+				   as static + per_elt * nelts.  */
+  const int scatter_static, scatter_per_elt; /* Cost of scatter store is
+				   computed as static + per_elt * nelts.  */
   const int l1_cache_size;	/* size of l1 cache, in kilobytes.  */
   const int l2_cache_size;	/* size of l2 cache, in kilobytes.  */
   const int prefetch_block;	/* bytes moved to cache for prefetch.  */
Index: config/i386/x86-tune-costs.h
===================================================================
--- config/i386/x86-tune-costs.h	(revision 254073)
+++ config/i386/x86-tune-costs.h	(working copy)
@@ -82,6 +82,8 @@  struct processor_costs ix86_size_cost =
   {3, 3, 3, 3, 3},				/* cost of unaligned SSE store
 					   in 128bit, 256bit and 512bit */
   3, 3,					/* SSE->integer and integer->SSE moves */
+  5, 0,					/* Gather load static, per_elt.  */
+  5, 0,					/* Gather store static, per_elt.  */
   0,					/* size of l1 cache  */
   0,					/* size of l2 cache  */
   0,					/* size of prefetch block */
@@ -166,6 +168,8 @@  struct processor_costs i386_cost = {	/*
 					   in 32,64,128,256 and 512-bit */
   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
   3, 3,					/* SSE->integer and integer->SSE moves */
+  4, 4,					/* Gather load static, per_elt.  */
+  4, 4,					/* Gather store static, per_elt.  */
   0,					/* size of l1 cache  */
   0,					/* size of l2 cache  */
   0,					/* size of prefetch block */
@@ -249,6 +253,8 @@  struct processor_costs i486_cost = {	/*
 					   in 32,64,128,256 and 512-bit */
   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
   3, 3,					/* SSE->integer and integer->SSE moves */
+  4, 4,					/* Gather load static, per_elt.  */
+  4, 4,					/* Gather store static, per_elt.  */
   4,					/* size of l1 cache.  486 has 8kB cache
 					   shared for code and data, so 4kB is
 					   not really precise.  */
@@ -334,6 +340,8 @@  struct processor_costs pentium_cost = {
 					   in 32,64,128,256 and 512-bit */
   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
   3, 3,					/* SSE->integer and integer->SSE moves */
+  4, 4,					/* Gather load static, per_elt.  */
+  4, 4,					/* Gather store static, per_elt.  */
   8,					/* size of l1 cache.  */
   8,					/* size of l2 cache  */
   0,					/* size of prefetch block */
@@ -410,6 +418,8 @@  struct processor_costs lakemont_cost = {
 					   in 32,64,128,256 and 512-bit */
   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
   3, 3,					/* SSE->integer and integer->SSE moves */
+  4, 4,					/* Gather load static, per_elt.  */
+  4, 4,					/* Gather store static, per_elt.  */
   8,					/* size of l1 cache.  */
   8,					/* size of l2 cache  */
   0,					/* size of prefetch block */
@@ -501,6 +511,8 @@  struct processor_costs pentiumpro_cost =
 					   in 32,64,128,256 and 512-bit */
   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
   3, 3,					/* SSE->integer and integer->SSE moves */
+  4, 4,					/* Gather load static, per_elt.  */
+  4, 4,					/* Gather store static, per_elt.  */
   8,					/* size of l1 cache.  */
   256,					/* size of l2 cache  */
   32,					/* size of prefetch block */
@@ -584,6 +596,8 @@  struct processor_costs geode_cost = {
 					   in 32,64,128,256 and 512-bit */
   {2, 2, 8, 16, 32},			/* cost of unaligned stores.  */
   6, 6,					/* SSE->integer and integer->SSE moves */
+  2, 2,					/* Gather load static, per_elt.  */
+  2, 2,					/* Gather store static, per_elt.  */
   64,					/* size of l1 cache.  */
   128,					/* size of l2 cache.  */
   32,					/* size of prefetch block */
@@ -666,6 +680,8 @@  struct processor_costs k6_cost = {
 					   in 32,64,128,256 and 512-bit */
   {2, 2, 8, 16, 32},			/* cost of unaligned stores.  */
   6, 6,					/* SSE->integer and integer->SSE moves */
+  2, 2,					/* Gather load static, per_elt.  */
+  2, 2,					/* Gather store static, per_elt.  */
   32,					/* size of l1 cache.  */
   32,					/* size of l2 cache.  Some models
 					   have integrated l2 cache, but
@@ -754,6 +770,8 @@  struct processor_costs athlon_cost = {
 					   in 32,64,128,256 and 512-bit */
   {4, 4, 5, 10, 20},			/* cost of unaligned stores.  */
   5, 5,					/* SSE->integer and integer->SSE moves */
+  4, 4,					/* Gather load static, per_elt.  */
+  4, 4,					/* Gather store static, per_elt.  */
   64,					/* size of l1 cache.  */
   256,					/* size of l2 cache.  */
   64,					/* size of prefetch block */
@@ -844,6 +862,8 @@  struct processor_costs k8_cost = {
 					   in 32,64,128,256 and 512-bit */
   {4, 4, 5, 10, 20},			/* cost of unaligned stores.  */
   5, 5,					/* SSE->integer and integer->SSE moves */
+  4, 4,					/* Gather load static, per_elt.  */
+  4, 4,					/* Gather store static, per_elt.  */
   64,					/* size of l1 cache.  */
   512,					/* size of l2 cache.  */
   64,					/* size of prefetch block */
@@ -946,6 +966,8 @@  struct processor_costs amdfam10_cost = {
 							       1/1  1/1
 					    MOVD reg32, xmmreg Double FADD 3
 							       1/1  1/1 */
+  4, 4,					/* Gather load static, per_elt.  */
+  4, 4,					/* Gather store static, per_elt.  */
   64,					/* size of l1 cache.  */
   512,					/* size of l2 cache.  */
   64,					/* size of prefetch block */
@@ -1041,6 +1063,8 @@  const struct processor_costs bdver1_cost
 					   in 32,64,128,256 and 512-bit */
   {10, 10, 10, 20, 30},			/* cost of unaligned stores.  */
   16, 20,				/* SSE->integer and integer->SSE moves */
+  12, 12,				/* Gather load static, per_elt.  */
+  10, 10,				/* Gather store static, per_elt.  */
   16,					/* size of l1 cache.  */
   2048,					/* size of l2 cache.  */
   64,					/* size of prefetch block */
@@ -1138,6 +1162,8 @@  const struct processor_costs bdver2_cost
 					   in 32,64,128,256 and 512-bit */
   {10, 10, 10, 20, 30},			/* cost of unaligned stores.  */
   16, 20,				/* SSE->integer and integer->SSE moves */
+  12, 12,				/* Gather load static, per_elt.  */
+  10, 10,				/* Gather store static, per_elt.  */
   16,					/* size of l1 cache.  */
   2048,					/* size of l2 cache.  */
   64,					/* size of prefetch block */
@@ -1234,6 +1260,8 @@  struct processor_costs bdver3_cost = {
 					   in 32,64,128,256 and 512-bit */
   {10, 10, 10, 20, 30},			/* cost of unaligned stores.  */
   16, 20,				/* SSE->integer and integer->SSE moves */
+  12, 12,				/* Gather load static, per_elt.  */
+  10, 10,				/* Gather store static, per_elt.  */
   16,					/* size of l1 cache.  */
   2048,					/* size of l2 cache.  */
   64,					/* size of prefetch block */
@@ -1329,6 +1357,8 @@  struct processor_costs bdver4_cost = {
 					   in 32,64,128,256 and 512-bit */
   {10, 10, 10, 20, 30},			/* cost of unaligned stores.  */
   16, 20,				/* SSE->integer and integer->SSE moves */
+  12, 12,				/* Gather load static, per_elt.  */
+  10, 10,				/* Gather store static, per_elt.  */
   16,					/* size of l1 cache.  */
   2048,					/* size of l2 cache.  */
   64,					/* size of prefetch block */
@@ -1435,6 +1465,11 @@  struct processor_costs znver1_cost = {
 					   in 32,64,128,256 and 512-bit.  */
   {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
   6, 6,					/* SSE->integer and integer->SSE moves.  */
+  /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPS is 35 uops,
+     throughput 12.  Approx 9 uops do not depend on vector size and every load
+     is 7 uops.  */
+  18, 8,				/* Gather load static, per_elt.  */
+  18, 10,				/* Gather store static, per_elt.  */
   32,					/* size of l1 cache.  */
   512,					/* size of l2 cache.  */
   64,					/* size of prefetch block.  */
@@ -1539,6 +1574,8 @@  const struct processor_costs btver1_cost
 					   in 32,64,128,256 and 512-bit */
   {10, 10, 12, 24, 48},			/* cost of unaligned stores.  */
   14, 14,				/* SSE->integer and integer->SSE moves */
+  10, 10,				/* Gather load static, per_elt.  */
+  10, 10,				/* Gather store static, per_elt.  */
   32,					/* size of l1 cache.  */
   512,					/* size of l2 cache.  */
   64,					/* size of prefetch block */
@@ -1624,6 +1661,8 @@  const struct processor_costs btver2_cost
 					   in 32,64,128,256 and 512-bit */
   {10, 10, 12, 24, 48},			/* cost of unaligned stores.  */
   14, 14,				/* SSE->integer and integer->SSE moves */
+  10, 10,				/* Gather load static, per_elt.  */
+  10, 10,				/* Gather store static, per_elt.  */
   32,					/* size of l1 cache.  */
   2048,					/* size of l2 cache.  */
   64,					/* size of prefetch block */
@@ -1708,6 +1747,8 @@  struct processor_costs pentium4_cost = {
 					   in 32,64,128,256 and 512-bit */
   {32, 32, 32, 64, 128},		/* cost of unaligned stores.  */
   20, 12,				/* SSE->integer and integer->SSE moves */
+  16, 16,				/* Gather load static, per_elt.  */
+  16, 16,				/* Gather store static, per_elt.  */
   8,					/* size of l1 cache.  */
   256,					/* size of l2 cache.  */
   64,					/* size of prefetch block */
@@ -1795,6 +1836,8 @@  struct processor_costs nocona_cost = {
 					   in 32,64,128,256 and 512-bit */
   {24, 24, 24, 48, 96},			/* cost of unaligned stores.  */
   20, 12,				/* SSE->integer and integer->SSE moves */
+  12, 12,				/* Gather load static, per_elt.  */
+  12, 12,				/* Gather store static, per_elt.  */
   8,					/* size of l1 cache.  */
   1024,					/* size of l2 cache.  */
   64,					/* size of prefetch block */
@@ -1880,6 +1923,8 @@  struct processor_costs atom_cost = {
 					   in 32,64,128,256 and 512-bit */
   {16, 16, 16, 32, 64},			/* cost of unaligned stores.  */
   8, 6,					/* SSE->integer and integer->SSE moves */
+  8, 8,					/* Gather load static, per_elt.  */
+  8, 8,					/* Gather store static, per_elt.  */
   32,					/* size of l1 cache.  */
   256,					/* size of l2 cache.  */
   64,					/* size of prefetch block */
@@ -1965,6 +2010,8 @@  struct processor_costs slm_cost = {
 					   in 32,64,128,256 and 512-bit */
   {16, 16, 16, 32, 64},			/* cost of unaligned stores.  */
   8, 6,					/* SSE->integer and integer->SSE moves */
+  8, 8,					/* Gather load static, per_elt.  */
+  8, 8,					/* Gather store static, per_elt.  */
   32,					/* size of l1 cache.  */
   256,					/* size of l2 cache.  */
   64,					/* size of prefetch block */
@@ -2050,6 +2097,8 @@  struct processor_costs intel_cost = {
 					   in 32,64,128,256 and 512-bit */
   {10, 10, 10, 10, 10},			/* cost of unaligned loads.  */
   4, 4,					/* SSE->integer and integer->SSE moves */
+  6, 6,					/* Gather load static, per_elt.  */
+  6, 6,					/* Gather store static, per_elt.  */
   32,					/* size of l1 cache.  */
   256,					/* size of l2 cache.  */
   64,					/* size of prefetch block */
@@ -2142,6 +2191,8 @@  struct processor_costs generic_cost = {
 					   in 32,64,128,256 and 512-bit */
  {10, 10, 10, 15, 20},			/* cost of unaligned stores.  */
   20, 20,				/* SSE->integer and integer->SSE moves */
+  6, 6,					/* Gather load static, per_elt.  */
+  6, 6,					/* Gather store static, per_elt.  */
   32,					/* size of l1 cache.  */
   512,					/* size of l2 cache.  */
   64,					/* size of prefetch block */
@@ -2239,6 +2290,11 @@  struct processor_costs core_cost = {
 					   in 32,64,128,256 and 512-bit */
   {6, 6, 6, 6, 12},			/* cost of unaligned stores.  */
   2, 2,					/* SSE->integer and integer->SSE moves */
+  /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPS is 9 uops,
+     rec. throughput 6.
+     So 5 uops statically and one uop per load.  */
+  10, 6,				/* Gather load static, per_elt.  */
+  10, 6,				/* Gather store static, per_elt.  */
   64,					/* size of l1 cache.  */
   512,					/* size of l2 cache.  */
   64,					/* size of prefetch block */