[RFA] Zen tuning part 9: Add support for scatter/gather in vectorizer costmodel

Message ID 20171017133415.GC94155@kam.mff.cuni.cz
State New
Series [RFA] Zen tuning part 9: Add support for scatter/gather in vectorizer costmodel

Commit Message

Jan Hubicka Oct. 17, 2017, 1:34 p.m. UTC
Hi,
Gather/scatter loads tend to be expensive (at least for x86), while we currently account them
as vector loads/stores, which are cheap.  This patch adds vectorizer cost entries for these
so they can be modelled more realistically.
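
For reference, the kind of loop that ends up using the new entries looks like this
(illustrative example only, not part of the patch); when vectorized, the indexed load
becomes a gather and the indexed store becomes a scatter, provided the target supports them:

/* src[idx[i]] turns into a vector gather load and dst[idx[i]] = ...
   into a vector scatter store when this loop is vectorized.  */
void
indexed_scale (double *restrict dst, const double *restrict src,
	       const int *restrict idx, int n)
{
  for (int i = 0; i < n; i++)
    dst[idx[i]] = src[idx[i]] * 2.0;
}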

Bootstrapped/regtested x86_64-linux, OK?

Honza

2017-10-17  Jan Hubicka  <hubicka@ucw.cz>

	* target.h (enum vect_cost_for_stmt): Add vector_gather_load and
	vector_scatter_store.
	* tree-vect-stmts.c (record_stmt_cost): Distinguish between normal
	and scatter/gather ops.

	* aarch64/aarch64.c (aarch64_builtin_vectorization_cost): Add
	vector_gather_load and vector_scatter_store.
	* arm/arm.c (arm_builtin_vectorization_cost): Likewise.
	* powerpcspe/powerpcspe.c (rs6000_builtin_vectorization_cost): Likewise.
	* rs6000/rs6000.c (rs6000_builtin_vectorization_cost): Likewise.
	* s390/s390.c (s390_builtin_vectorization_cost): Likewise.
	* spu/spu.c (spu_builtin_vectorization_cost): Likewise.

Comments

Richard Biener Oct. 17, 2017, 1:41 p.m. UTC | #1
On Tue, 17 Oct 2017, Jan Hubicka wrote:

> Hi,
> gether/scatter loads tends to be expensive (at least for x86) while we now account them
> as vector loads/stores which are cheap.  This patch adds vectorizer cost entry for these
> so this can be modelled more realistically.
> 
> Bootstrapped/regtested x86_64-linux, OK?

Ok.  "gather" and "load" are somewhat redundant, likewise
"scatter" and "store".  So you might want to change the names to just
vector_gather and vector_scatter.  Even the vector_ prefix is redundant...

Best available implementations manage to hide the vector build
cost and just expose the latency of the load(s).  I wonder what
Zen does here ;)

Note that the biggest source of imprecision in the cost model
is vec_perm, because we lack information about the
permutation mask, which means we can't distinguish between
cross-lane and intra-lane permutes.
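
To illustrate the distinction with AVX intrinsics (example only; an intra-lane
shuffle stays within each 128-bit lane and is typically cheaper than a cross-lane one):

#include <immintrin.h>

__m256d
intra_lane (__m256d x)
{
  /* vpermilpd: swap the two doubles within each 128-bit lane.  */
  return _mm256_permute_pd (x, 0x5);
}

__m256d
cross_lane (__m256d x)
{
  /* vperm2f128: swap the two 128-bit lanes, crossing lane boundaries.  */
  return _mm256_permute2f128_pd (x, x, 0x01);
}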

Richard.

> Honza
> 
> 2017-10-17  Jan Hubicka  <hubicka@ucw.cz>
> 
> 	* target.h (enum vect_cost_for_stmt): Add vec_gather_load and
> 	vec_scatter_store
> 	* tree-vect-stmts.c (record_stmt_cost): Make difference between normal
> 	and scatter/gather ops.
> 
> 	* aarch64/aarch64.c (aarch64_builtin_vectorization_cost): Add
> 	vec_gather_load and vec_scatter_store.
> 	* arm/arm.c (arm_builtin_vectorization_cost): Likewise.
> 	* powerpcspe/powerpcspe.c (rs6000_builtin_vectorization_cost): Likewise.
> 	* rs6000/rs6000.c (rs6000_builtin_vectorization_cost): Likewise.
> 	* s390/s390.c (s390_builtin_vectorization_cost): Likewise.
> 	* spu/spu.c (spu_builtin_vectorization_cost): Likewise.
> 
> Index: config/aarch64/aarch64.c
> ===================================================================
> --- config/aarch64/aarch64.c	(revision 253789)
> +++ config/aarch64/aarch64.c	(working copy)
> @@ -8547,9 +8547,10 @@ aarch64_builtin_vectorization_cost (enum
>  	return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
>  
>        case vector_load:
> +      case vector_gather_load:
>  	return costs->vec_align_load_cost;
>  
> -      case vector_store:
> +      case vector_scatter_store:
>  	return costs->vec_store_cost;
>  
>        case vec_to_scalar:
> Index: config/arm/arm.c
> ===================================================================
> --- config/arm/arm.c	(revision 253789)
> +++ config/arm/arm.c	(working copy)
> @@ -11241,9 +11241,11 @@ arm_builtin_vectorization_cost (enum vec
>          return current_tune->vec_costs->vec_stmt_cost;
>  
>        case vector_load:
> +      case vector_gather_load:
>          return current_tune->vec_costs->vec_align_load_cost;
>  
>        case vector_store:
> +      case vector_scatter_store:
>          return current_tune->vec_costs->vec_store_cost;
>  
>        case vec_to_scalar:
> Index: config/powerpcspe/powerpcspe.c
> ===================================================================
> --- config/powerpcspe/powerpcspe.c	(revision 253789)
> +++ config/powerpcspe/powerpcspe.c	(working copy)
> @@ -5834,6 +5834,8 @@ rs6000_builtin_vectorization_cost (enum
>        case vector_stmt:
>        case vector_load:
>        case vector_store:
> +      case vector_gather_load:
> +      case vector_scatter_store:
>        case vec_to_scalar:
>        case scalar_to_vec:
>        case cond_branch_not_taken:
> Index: config/rs6000/rs6000.c
> ===================================================================
> --- config/rs6000/rs6000.c	(revision 253789)
> +++ config/rs6000/rs6000.c	(working copy)
> @@ -5398,6 +5398,8 @@ rs6000_builtin_vectorization_cost (enum
>        case vector_stmt:
>        case vector_load:
>        case vector_store:
> +      case vector_gather_load:
> +      case vector_scatter_store:
>        case vec_to_scalar:
>        case scalar_to_vec:
>        case cond_branch_not_taken:
> Index: config/s390/s390.c
> ===================================================================
> --- config/s390/s390.c	(revision 253789)
> +++ config/s390/s390.c	(working copy)
> @@ -3717,6 +3717,8 @@ s390_builtin_vectorization_cost (enum ve
>        case vector_stmt:
>        case vector_load:
>        case vector_store:
> +      case vector_gather_load:
> +      case vector_scatter_store:
>        case vec_to_scalar:
>        case scalar_to_vec:
>        case cond_branch_not_taken:
> Index: config/spu/spu.c
> ===================================================================
> --- config/spu/spu.c	(revision 253789)
> +++ config/spu/spu.c	(working copy)
> @@ -6625,6 +6625,8 @@ spu_builtin_vectorization_cost (enum vec
>        case vector_stmt:
>        case vector_load:
>        case vector_store:
> +      case vector_gather_load:
> +      case vector_scatter_store:
>        case vec_to_scalar:
>        case scalar_to_vec:
>        case cond_branch_not_taken:
> Index: target.h
> ===================================================================
> --- target.h	(revision 253789)
> +++ target.h	(working copy)
> @@ -171,9 +171,11 @@ enum vect_cost_for_stmt
>    scalar_store,
>    vector_stmt,
>    vector_load,
> +  vector_gather_load,
>    unaligned_load,
>    unaligned_store,
>    vector_store,
> +  vector_scatter_store,
>    vec_to_scalar,
>    scalar_to_vec,
>    cond_branch_not_taken,
> Index: tree-vect-stmts.c
> ===================================================================
> --- tree-vect-stmts.c	(revision 253789)
> +++ tree-vect-stmts.c	(working copy)
> @@ -95,6 +95,12 @@ record_stmt_cost (stmt_vector_for_cost *
>  		  enum vect_cost_for_stmt kind, stmt_vec_info stmt_info,
>  		  int misalign, enum vect_cost_model_location where)
>  {
> +  if ((kind == vector_load || kind == unaligned_load)
> +      && STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> +    kind = vector_gather_load;
> +  if ((kind == vector_store || kind == unaligned_store)
> +      && STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> +    kind = vector_scatter_store;
>    if (body_cost_vec)
>      {
>        tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
> 
>
Jan Hubicka Oct. 17, 2017, 5:22 p.m. UTC | #2
> On Tue, 17 Oct 2017, Jan Hubicka wrote:
> 
> > Hi,
> > gether/scatter loads tends to be expensive (at least for x86) while we now account them
> > as vector loads/stores which are cheap.  This patch adds vectorizer cost entry for these
> > so this can be modelled more realistically.
> > 
> > Bootstrapped/regtested x86_64-linux, OK?
> 
> Ok.  gather and load is somewhat redundant, likewise
> scatter and store.  So you might want to change it to just
> vector_gather and vector_scatter.  Even vector_ is redundant...

Hehe, coming from outside the vectorizer world, I did not know what
scatter/gather was, and thus I wanted to keep load/store and vec in the names so it will be
easier to google for those who will need to fill in the numbers in the future :)
> 
> Best available implementations manage to hide the vector build
> cost and just expose the latency of the load(s).  I wonder what
> Zen does here ;)

According to Agner's tables, gathers range from 12 ops (vgatherdpd)
to 66 ops (vpgatherdd).  I assume that the CPU needs to do the following:

1) transfer the offsets from the SSE unit to the ALU for address generation
   (3 cycles each, 2 ops)
2) do the address calculation (2 ops, probably 4 ops because it does not map naturally
   to the AGU)
3) do the load (7 cycles each, 2 ops)
4) merge the results (1 op)

so I get 7 ops; not sure what the remaining 5 do.

Agner does not account for time, but according to
http://users.atw.hu/instlatx64/AuthenticAMD0800F11_K17_Zen_InstLatX64.txt the
gather latency ranges from 14 cycles (vgatherdpd) to 20 cycles.  Here I guess it is
3+1+7+1=12, so it seems to work out.

If you implement gather by hand, you save the SSE->address calculation path and
thus can get faster.
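
By hand I mean roughly the following (illustrative sketch only; the indices come
from memory/GPRs instead of an SSE register, so the transfer in step 1 goes away):

#include <immintrin.h>

/* Manual gather: four scalar loads through the normal address-generation
   path plus a vector build, instead of a single vgatherdpd.  */
static __m256d
gather_by_hand (const double *base, const int *idx)
{
  return _mm256_set_pd (base[idx[3]], base[idx[2]],
			base[idx[1]], base[idx[0]]);
}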
> 
> Note the most major source of impreciseness in the cost model
> is from vec_perm because we lack the information of the
> permutation mask which means we can't distinguish between
> cross-lane and intra-lane permutes.

Besides that, we lack information about which operation we are doing (addition
or division?), which may be useful to pass down, especially because we do
have the relevant information handy in the x86_cost tables.  So I am thinking
of adding an extra parameter to the hook telling it the operation.
What info would we need to pass for permutations?

Honza
Richard Biener Oct. 18, 2017, 7:35 a.m. UTC | #3
On Tue, 17 Oct 2017, Jan Hubicka wrote:

> > On Tue, 17 Oct 2017, Jan Hubicka wrote:
> > 
> > > Hi,
> > > gether/scatter loads tends to be expensive (at least for x86) while we now account them
> > > as vector loads/stores which are cheap.  This patch adds vectorizer cost entry for these
> > > so this can be modelled more realistically.
> > > 
> > > Bootstrapped/regtested x86_64-linux, OK?
> > 
> > Ok.  gather and load is somewhat redundant, likewise
> > scatter and store.  So you might want to change it to just
> > vector_gather and vector_scatter.  Even vector_ is redundant...
> 
> Hehe, comming from outside of vectorizer world, I did not know what
> scatter/gather is and thus I wanted to keep load/store and vec in so it will be
> easier to google for those who will need to fill in the numbers in future :)
> > 
> > Best available implementations manage to hide the vector build
> > cost and just expose the latency of the load(s).  I wonder what
> > Zen does here ;)
> 
> According to Agner's tables, gathers range from 12 ops (vgatherdpd)
> to 66 ops (vpgatherdd).  I assume that CPU needs to do following:
> 
> 1) transfer the offsets sse->ALU unit for address generation (3 cycles
>    each, 2 ops)
> 2) do the address calcualtion (2 ops, probably 4 ops because it does not map naturally
> 			       to AGU)
> 2) do the load (7 cycles each, 2 ops)
> 3) merge results (1 ops)
> 
> so I get 7 ops, not sure what remaining 5 do.
> 
> Agner does not account time, but According to
> http://users.atw.hu/instlatx64/AuthenticAMD0800F11_K17_Zen_InstLatX64.txt the
> gather time ranges from 14 cycles (vgatherpd) to 20 cycles.  Here I guess it is
> 3+1+7+1=12 so it seems to work.
> 
> If you implement gather by hand, you save the SSE->address caluclation path and
> thus you can get faster.

I see.  It looks to me like Zen should then disable gather/scatter completely
and we should implement manual gather/scatter code generation in the
vectorizer (or lower it in vector lowering).  It sounds like they
only implemented it to have "complete" AVX2 support (ISTR scatter
is only in AVX512F).

> > Note the most major source of impreciseness in the cost model
> > is from vec_perm because we lack the information of the
> > permutation mask which means we can't distinguish between
> > cross-lane and intra-lane permutes.
> 
> Besides that we lack information about what operation we do (addition
> or division?) which may be useful to pass down, especially because we do
> have relevant information handy in the x86_cost tables.  So I am thinking
> of adding extra parameter to the hook telling the operation.

Not sure.  The costs are all supposed to be relative to scalar cost
and I fear we get nearer to a GIGO syndrome when adding more information
here ;)

> What info we need to pass for permutations?

The full constant permutation vector ...

Note that I think this particular cost hook isn't the best one.  We've
added TARGET_VECTORIZE_ADD_STMT_COST and friends to be the more
"powerful" ones, but unfortunately the vectorizer itself doesn't
always use them.  They are also somewhat too powerful, and at the same time
the vectorizer doesn't provide all the information in a convenient way
through the passed stmt_info, so the hook would have to reverse-engineer
things (like a permutation mask).  At least the operation code is
readily available here.
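
For example, a hook that gets the stmt_info can recover the operation with
something like this (sketch; stmt_operation is a made-up helper using the usual
accessors):

/* Recover the operation code of the statement being costed, e.g.
   PLUS_EXPR vs. RDIV_EXPR, from the vectorizer's stmt_vec_info.  */
static enum tree_code
stmt_operation (stmt_vec_info stmt_info)
{
  gimple *stmt = stmt_info ? STMT_VINFO_STMT (stmt_info) : NULL;
  if (stmt && is_gimple_assign (stmt))
    return gimple_assign_rhs_code (stmt);
  return ERROR_MARK;
}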

I think it only needs some minor refactoring to make the vectorizer
always use the "proper" hook though.

Richard.

Jan Hubicka Oct. 18, 2017, 12:28 p.m. UTC | #4
> > According to Agner's tables, gathers range from 12 ops (vgatherdpd)
> > to 66 ops (vpgatherdd).  I assume that CPU needs to do following:
> > 
> > 1) transfer the offsets sse->ALU unit for address generation (3 cycles
> >    each, 2 ops)
> > 2) do the address calcualtion (2 ops, probably 4 ops because it does not map naturally
> > 			       to AGU)
> > 2) do the load (7 cycles each, 2 ops)
> > 3) merge results (1 ops)
> > 
> > so I get 7 ops, not sure what remaining 5 do.
> > 
> > Agner does not account time, but According to
> > http://users.atw.hu/instlatx64/AuthenticAMD0800F11_K17_Zen_InstLatX64.txt the
> > gather time ranges from 14 cycles (vgatherpd) to 20 cycles.  Here I guess it is
> > 3+1+7+1=12 so it seems to work.
> > 
> > If you implement gather by hand, you save the SSE->address caluclation path and
> > thus you can get faster.
> 
> I see.  It looks to me Zen should disable gather/scatter then completely
> and we should implement manual gather/scatter code-generation in the
> vectorizer (or lower it in vector lowering).  It sounds like they
> only implemented it to have "complete" AVX2 support (ISTR scatter
> is only in AVX512f).

Those instructions seem similarly expensive in the Intel implementation.
http://users.atw.hu/instlatx64/GenuineIntel0050654_SkylakeXeon9_InstLatX64.txt
lists latencies ranging from 18 to 32 cycles.

Of course, it may also be the case that the utility is measuring gathers incorrectly.
According to Agner's tables, Skylake has optimized gathers; they used to be
12 to 34 uops on Haswell and are now 4 to 5.
> 
> > > Note the most major source of impreciseness in the cost model
> > > is from vec_perm because we lack the information of the
> > > permutation mask which means we can't distinguish between
> > > cross-lane and intra-lane permutes.
> > 
> > Besides that we lack information about what operation we do (addition
> > or division?) which may be useful to pass down, especially because we do
> > have relevant information handy in the x86_cost tables.  So I am thinking
> > of adding extra parameter to the hook telling the operation.
> 
> Not sure.  The costs are all supposed to be relative to scalar cost
> and I fear we get nearer to a GIGO syndrome when adding more information
> here ;)

Yep, however there is setup cost (like loads/stores) which comes into play
as well.  I will see how far I can get by making the x86 costs more "realistic".

Honza
Richard Biener Oct. 18, 2017, 1:03 p.m. UTC | #5
On Wed, 18 Oct 2017, Jan Hubicka wrote:

> > > According to Agner's tables, gathers range from 12 ops (vgatherdpd)
> > > to 66 ops (vpgatherdd).  I assume that CPU needs to do following:
> > > 
> > > 1) transfer the offsets sse->ALU unit for address generation (3 cycles
> > >    each, 2 ops)
> > > 2) do the address calcualtion (2 ops, probably 4 ops because it does not map naturally
> > > 			       to AGU)
> > > 2) do the load (7 cycles each, 2 ops)
> > > 3) merge results (1 ops)
> > > 
> > > so I get 7 ops, not sure what remaining 5 do.
> > > 
> > > Agner does not account time, but According to
> > > http://users.atw.hu/instlatx64/AuthenticAMD0800F11_K17_Zen_InstLatX64.txt the
> > > gather time ranges from 14 cycles (vgatherpd) to 20 cycles.  Here I guess it is
> > > 3+1+7+1=12 so it seems to work.
> > > 
> > > If you implement gather by hand, you save the SSE->address caluclation path and
> > > thus you can get faster.
> > 
> > I see.  It looks to me Zen should disable gather/scatter then completely
> > and we should implement manual gather/scatter code-generation in the
> > vectorizer (or lower it in vector lowering).  It sounds like they
> > only implemented it to have "complete" AVX2 support (ISTR scatter
> > is only in AVX512f).
> 
> Those instructions seems similarly expensive in Intel implementation.
> http://users.atw.hu/instlatx64/GenuineIntel0050654_SkylakeXeon9_InstLatX64.txt
> lists latencies ranging from 18 to 32 cycles.
> 
> Of course it may also be the case that the utility is measuring gathers incorrectly.
> according to Agner's table Skylake has optimized gathers, they used to be
> 12 to 34 uops on haswell and are no 4 to 5.
> > 
> > > > Note the most major source of impreciseness in the cost model
> > > > is from vec_perm because we lack the information of the
> > > > permutation mask which means we can't distinguish between
> > > > cross-lane and intra-lane permutes.
> > > 
> > > Besides that we lack information about what operation we do (addition
> > > or division?) which may be useful to pass down, especially because we do
> > > have relevant information handy in the x86_cost tables.  So I am thinking
> > > of adding extra parameter to the hook telling the operation.
> > 
> > Not sure.  The costs are all supposed to be relative to scalar cost
> > and I fear we get nearer to a GIGO syndrome when adding more information
> > here ;)
> 
> Yep, however there is setup cost (like loads/stores) which comes into game
> as well.  I will see how far i can get by making x86 costs more "realistic"

I think it should always count the cost of n scalar loads plus
an overhead depending on the microarchitecture.  As you say, we're
not getting rid of any memory latencies (in the worst case).  From
Agner I read that Skylake optimized gathers down to the actual memory
access cost; the overhead is basically well hidden.
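
I.e. something like this (sketch with made-up names; the per-microarchitecture
overhead would come from the cost tables):

/* Cost a gather/scatter as N scalar memory accesses plus a fixed
   overhead that depends on the microarchitecture.  */
static int
gather_scatter_cost (int nunits, int scalar_memory_cost, int uarch_overhead)
{
  return nunits * scalar_memory_cost + uarch_overhead;
}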

Richard.
Jan Hubicka Oct. 18, 2017, 3:07 p.m. UTC | #6
> > Those instructions seems similarly expensive in Intel implementation.
> > http://users.atw.hu/instlatx64/GenuineIntel0050654_SkylakeXeon9_InstLatX64.txt
> > lists latencies ranging from 18 to 32 cycles.
> > 
> > Of course it may also be the case that the utility is measuring gathers incorrectly.
> > according to Agner's table Skylake has optimized gathers, they used to be
> > 12 to 34 uops on haswell and are no 4 to 5.
> > > 
> > > > > Note the most major source of impreciseness in the cost model
> > > > > is from vec_perm because we lack the information of the
> > > > > permutation mask which means we can't distinguish between
> > > > > cross-lane and intra-lane permutes.
> > > > 
> > > > Besides that we lack information about what operation we do (addition
> > > > or division?) which may be useful to pass down, especially because we do
> > > > have relevant information handy in the x86_cost tables.  So I am thinking
> > > > of adding extra parameter to the hook telling the operation.
> > > 
> > > Not sure.  The costs are all supposed to be relative to scalar cost
> > > and I fear we get nearer to a GIGO syndrome when adding more information
> > > here ;)
> > 
> > Yep, however there is setup cost (like loads/stores) which comes into game
> > as well.  I will see how far i can get by making x86 costs more "realistic"
> 
> I think it should be always counting the cost of n scalar loads plus
> an overhead depending on the microarchitecture.  As you say we're
> not getting rid of any memory latencies (in the worst case).  From
> Agner I read Skylake optimized gathers down to the actual memory
> access cost, the overhead is basically well hidden.

Where did you find it? It does not seem to quite match the instruction latency table
above.

Honza
Jan Hubicka Oct. 19, 2017, 7:17 a.m. UTC | #7
Hi,
this is a proof-of-concept patch that makes the vectorizer costs use the costs used for rtx_cost
and register_move_cost, which are readily available in ix86_cost, instead of using
its own set of random values.  At least until we have evidence that vectorizer
costs need to differ, I do not think we want to complicate CPU tuning by having them
twice.

This is of course quite an intrusive change because it affects all
x86 targets.  I have finally worked out that the "random" values used by the AMD targets
correspond to the latencies of bdver1.

I have benchmarked them on Zen and also temporarily patched Czerny (Haswell).
It seems to cause no regressions and quite nice improvements:
  - 27.3% for facerec on Zen
  - 7% for mgrid on Haswell
  - maybe 1% for galgel on Haswell
  - 3% for facerec on Haswell
  - maybe 1% for aspi on Haswell
  - there may be a small off-noise improvement for rnflow and a regression for fatigue2 on Haswell

So I would say that the outcome is surprisingly good (especially given the lack of
noteworthy regressions).  I also know that the vectorizer hurts performance on Zen in the
Mesa/tonto benchmarks, which is not cured by this patch alone.

There is testsuite fallout though.

./testsuite/g++/g++.sum:FAIL: g++.dg/vect/slp-pr56812.cc  -std=c++11  scan-tree-dump-times slp1 "basic block vectorized" 1 (found 0 times)
./testsuite/g++/g++.sum:FAIL: g++.dg/vect/slp-pr56812.cc  -std=c++14  scan-tree-dump-times slp1 "basic block vectorized" 1 (found 0 times)
./testsuite/g++/g++.sum:FAIL: g++.dg/vect/slp-pr56812.cc  -std=c++98  scan-tree-dump-times slp1 "basic block vectorized" 1 (found 0 times)

  Here we now vectorize the loop first, while originally we unrolled it and SLP-vectorized it afterwards.

./testsuite/gcc/gcc.sum:FAIL: gcc.target/i386/l_fma_double_1.c scan-assembler-times vfmadd[123]+sd 56 (found 32 times)
./testsuite/gcc/gcc.sum:FAIL: gcc.target/i386/l_fma_double_2.c scan-assembler-times vfmadd[123]+sd 56 (found 32 times)
./testsuite/gcc/gcc.sum:FAIL: gcc.target/i386/l_fma_double_3.c scan-assembler-times vfmadd[123]+sd 56 (found 32 times)
./testsuite/gcc/gcc.sum:FAIL: gcc.target/i386/l_fma_double_4.c scan-assembler-times vfmadd[123]+sd 56 (found 32 times)
./testsuite/gcc/gcc.sum:FAIL: gcc.target/i386/l_fma_double_5.c scan-assembler-times vfmadd[123]+sd 56 (found 32 times)
./testsuite/gcc/gcc.sum:FAIL: gcc.target/i386/l_fma_double_6.c scan-assembler-times vfmadd[123]+sd 56 (found 32 times)
./testsuite/gcc/gcc.sum:FAIL: gcc.target/i386/l_fma_float_1.c scan-assembler-times vfmadd[123]+ss 120 (found 64 times)
./testsuite/gcc/gcc.sum:FAIL: gcc.target/i386/l_fma_float_2.c scan-assembler-times vfmadd[123]+ss 120 (found 64 times)
./testsuite/gcc/gcc.sum:FAIL: gcc.target/i386/l_fma_float_3.c scan-assembler-times vfmadd[123]+ss 120 (found 64 times)
./testsuite/gcc/gcc.sum:FAIL: gcc.target/i386/l_fma_float_4.c scan-assembler-times vfmadd[123]+ss 120 (found 64 times)
./testsuite/gcc/gcc.sum:FAIL: gcc.target/i386/l_fma_float_5.c scan-assembler-times vfmadd[123]+ss 120 (found 64 times)
./testsuite/gcc/gcc.sum:FAIL: gcc.target/i386/l_fma_float_6.c scan-assembler-times vfnmsub[123]+ss 120 (found 64 times)

And friends; clearly we do not vectorize all of the loops.  I have not looked into the details yet.

./testsuite/gcc/gcc.sum:FAIL: gcc.target/i386/pr61403.c scan-assembler blend

Here again we vectorize the loop while originally we did SLP.  I am not sure why the loop
vectorizer does not use blend.

./testsuite/gcc/gcc.sum:FAIL: gcc.target/i386/pr79683.c scan-assembler-times padd 1 (found 0 times)

Here we are supposed to vectorize two integer additions, but since the generic cost model now claims that
the latency of a vector add is twice that of an integer add, we don't.  I think that makes sense.

./testsuite/gcc/gcc.sum:FAIL: gcc.target/i386/pr79723.c scan-assembler mov[au]p.[ \t][^,]+, %gs:

Similarly here.

If it seems to make sense, I will clean it up (remove the now unused entries and scale
the conditional costs by COSTS_N_INSNS) and fix the testsuite fallout.
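
For reference, the unit conversion used in the patch below is roughly this
(sketch; scalar_load_vect_cost is a made-up name, COSTS_N_INSNS is the usual
rtl.h macro, and the int_load/sse_load tables are in units where a register
move costs 2):

/* COSTS_N_INSNS (N) expands to N * 4, while the load/store tables are
   relative to a register move costing 2, so dividing by 2 puts the
   memory latencies on the same scale as the other insn costs.  */
#define COSTS_N_INSNS(N) ((N) * 4)

static int
scalar_load_vect_cost (int load_table_entry)
{
  return COSTS_N_INSNS (load_table_entry) / 2;
}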

Honza

Index: i386.c
===================================================================
--- i386.c	(revision 253824)
+++ i386.c	(working copy)
@@ -44015,50 +44015,56 @@ static int
 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
                                  tree vectype, int)
 {
+  bool fp = false;
+  if (vectype != NULL)
+    fp = FLOAT_TYPE_P (vectype);
+
   switch (type_of_cost)
     {
       case scalar_stmt:
-        return ix86_cost->scalar_stmt_cost;
+        return fp ? ix86_cost->addss : COSTS_N_INSNS (1);
 
       case scalar_load:
-        return ix86_cost->scalar_load_cost;
+        return COSTS_N_INSNS (fp ? ix86_cost->sse_load[0]
+			      : ix86_cost->int_load [2]) / 2;
 
       case scalar_store:
-        return ix86_cost->scalar_store_cost;
+        return COSTS_N_INSNS (fp ? ix86_cost->sse_store[0]
+			      : ix86_cost->int_store [2]) / 2;
 
       case vector_stmt:
-        return ix86_cost->vec_stmt_cost;
+        return fp ? ix86_cost->addss : ix86_cost->sse_op;
 
       case vector_load:
-        return ix86_cost->vec_align_load_cost;
+        return COSTS_N_INSNS (ix86_cost->sse_load[2]) / 2;
 
       case vector_store:
-        return ix86_cost->vec_store_cost;
+        return COSTS_N_INSNS (ix86_cost->sse_store[2]) / 2;
 
       case vec_to_scalar:
-        return ix86_cost->vec_to_scalar_cost;
-
       case scalar_to_vec:
-        return ix86_cost->scalar_to_vec_cost;
+        return ix86_cost->sse_op;
 
       case unaligned_load:
-      case unaligned_store:
       case vector_gather_load:
+        return COSTS_N_INSNS (ix86_cost->sse_load[2]) / 2;
+
+      case unaligned_store:
       case vector_scatter_store:
-        return ix86_cost->vec_unalign_load_cost;
+        return COSTS_N_INSNS (ix86_cost->sse_store[2]) / 2;
 
       case cond_branch_taken:
-        return ix86_cost->cond_taken_branch_cost;
+        return COSTS_N_INSNS (ix86_cost->cond_taken_branch_cost);
 
       case cond_branch_not_taken:
-        return ix86_cost->cond_not_taken_branch_cost;
+        return COSTS_N_INSNS (ix86_cost->cond_not_taken_branch_cost);
 
       case vec_perm:
       case vec_promote_demote:
-        return ix86_cost->vec_stmt_cost;
+        return ix86_cost->sse_op;
 
       case vec_construct:
-	return ix86_cost->vec_stmt_cost * (TYPE_VECTOR_SUBPARTS (vectype) - 1);
+	return ix86_cost->sse_op * (TYPE_VECTOR_SUBPARTS (vectype) - 1);
 
       default:
         gcc_unreachable ();
Richard Biener Oct. 19, 2017, 8:05 a.m. UTC | #8
On Thu, 19 Oct 2017, Jan Hubicka wrote:

> Hi,
> this is proof of concept patch for vectorizer costs to use costs used for rtx_cost
> and register_move_cost which are readily available in ix86_costs instead of using
> its own set of random values.  At least until we have proof of evidence that vectroizer
> costs needs to differ, I do not think we want to complicate CPU tuning by having them
> twice.
> 
> This is of course quite intrusive change to what we have becuase it affects all
> x86 targets.  I have finally worked out that the "random" values used by AMD target
> corresponds to latencies of bdver1.
> 
> I have benchmarked them on Zen and also temporarily patches Czerny (Haswel).
> It seems to cause no regression and quite nice improvements:
>   - 27.3% for facerec on Zen
>   - 7% for mgrid on Haswel
>   - maybe 1% for galgel of Haswell
>   - 3% for facerec on Haswell
>   - maybe 1% aspi on Haswell
>   - there may be small off-noise improvement for rnflow and regression for fatigue2 on Haswell
> 
> So I would say that outcome is surprisingly good (especially due to lack of
> noteworthy regressions).  I also know that vectorizer hurts performance on Zen and
> Mesa/tonto benchmarks which is not cured by this patch alone.
> 
> There is testsuite fallout though.
> 
> ./testsuite/g++/g++.sum:FAIL: g++.dg/vect/slp-pr56812.cc  -std=c++11  scan-tree-dump-times slp1 "basic block vectorized" 1 (found 0 times)
> ./testsuite/g++/g++.sum:FAIL: g++.dg/vect/slp-pr56812.cc  -std=c++14  scan-tree-dump-times slp1 "basic block vectorized" 1 (found 0 times)
> ./testsuite/g++/g++.sum:FAIL: g++.dg/vect/slp-pr56812.cc  -std=c++98  scan-tree-dump-times slp1 "basic block vectorized" 1 (found 0 times)
> 
>   Here we vectorize the loop before first while originally we unrolled and SLP vectorized next
> 
> ./testsuite/gcc/gcc.sum:FAIL: gcc.target/i386/l_fma_double_1.c scan-assembler-times vfmadd[123]+sd 56 (found 32 times)
> ./testsuite/gcc/gcc.sum:FAIL: gcc.target/i386/l_fma_double_2.c scan-assembler-times vfmadd[123]+sd 56 (found 32 times)
> ./testsuite/gcc/gcc.sum:FAIL: gcc.target/i386/l_fma_double_3.c scan-assembler-times vfmadd[123]+sd 56 (found 32 times)
> ./testsuite/gcc/gcc.sum:FAIL: gcc.target/i386/l_fma_double_4.c scan-assembler-times vfmadd[123]+sd 56 (found 32 times)
> ./testsuite/gcc/gcc.sum:FAIL: gcc.target/i386/l_fma_double_5.c scan-assembler-times vfmadd[123]+sd 56 (found 32 times)
> ./testsuite/gcc/gcc.sum:FAIL: gcc.target/i386/l_fma_double_6.c scan-assembler-times vfmadd[123]+sd 56 (found 32 times)
> ./testsuite/gcc/gcc.sum:FAIL: gcc.target/i386/l_fma_float_1.c scan-assembler-times vfmadd[123]+ss 120 (found 64 times)
> ./testsuite/gcc/gcc.sum:FAIL: gcc.target/i386/l_fma_float_2.c scan-assembler-times vfmadd[123]+ss 120 (found 64 times)
> ./testsuite/gcc/gcc.sum:FAIL: gcc.target/i386/l_fma_float_3.c scan-assembler-times vfmadd[123]+ss 120 (found 64 times)
> ./testsuite/gcc/gcc.sum:FAIL: gcc.target/i386/l_fma_float_4.c scan-assembler-times vfmadd[123]+ss 120 (found 64 times)
> ./testsuite/gcc/gcc.sum:FAIL: gcc.target/i386/l_fma_float_5.c scan-assembler-times vfmadd[123]+ss 120 (found 64 times)
> ./testsuite/gcc/gcc.sum:FAIL: gcc.target/i386/l_fma_float_6.c scan-assembler-times vfnmsub[123]+ss 120 (found 64 times)
> 
> And friends, clearly we do not vectorize all loops, I did not look into details yet
> 
> ./testsuite/gcc/gcc.sum:FAIL: gcc.target/i386/pr61403.c scan-assembler blend
> 
> Here again we vectorize loop while originally we did SLP.  I am not sure why loop
> vectorizer does not use blend.
> 
> ./testsuite/gcc/gcc.sum:FAIL: gcc.target/i386/pr79683.c scan-assembler-times padd 1 (found 0 times)
> 
> Here we are supposed to vectorize two integer additions, but since generic cost model now claims that
> latency of vector add is twice of integer add we don't.  I think it makes sense.
> 
> ./testsuite/gcc/gcc.sum:FAIL: gcc.target/i386/pr79723.c scan-assembler mov[au]p.[ \t][^,]+, %gs:
> 
> Similarly here.
> 
> If it seems to make sense, I will clean it up (remove now unused entries and scale
> conditional costs by COSTS_N_INSNS) and fix the tessuite fallout.

Please look at the testsuite fallout in detail.  Note that only
testcases that do not disable the cost model should be affected
(all vect.exp testcases disable the cost model for example).

The patch itself looks mostly good.  I suppose if we also have
separate costs for float vs. double you could do a bit better
by looking at the type in more detail.  I think vectype should
be non-NULL most of the time - but for the scalar cost, for example,
it might not always be there; likewise for SLP vectorization the
scalar cost calls will not have the type information, so you'll
get a mixup between integer and FP costing scalar vs. vector.
Nothing that cannot be fixed on the vectorizer side, but ...
(we'd document that for scalar costs we pass the scalar type,
for example).

Richard.

Jan Hubicka Oct. 21, 2017, 11:58 a.m. UTC | #9
> Please look at the testsuite fallout in detail.  Note that only
> testcases that do not disable the cost model should be affected
> (all vect.exp testcases disable the cost model for example).
> 
> The patch itself looks mostly good, I suppose if we also have
> separate costs for float vs. double you could do a bit better
> by looking at the type in more detail.  I think vectype should
> be non-NULL most of the time - but for example for the scalar cost
> it might not be always there, likewise for SLP vectorization the
> scalar cost calls will not have the type information so you'll
> get a mixup between integer and FP costing scalar vs. vector.
> Nothing that cannot be fixed on the vectorizer side, but ...
> (we'd document that for scalar costs we pass the scalar type
> for example).
> 
> Richard.

Hi,
this is the polished patch I have committed.  It now affects only one testcase.
Most of them were affected by the fact that the generic and core move cost tables
were set up so that SSE loads/stores were more expensive than their integer
counterparts (making vectorization seem unprofitable), which I fixed
yesterday.

This seems to have been cut&pasted from the pentium4 cost table.  Tonight's core2 runs
seem to be happy with this change.

The FMA testcases failed to count FMA instructions because we stopped peeling for
alignment.  This is because I made unaligned and aligned loads/stores the same
cost.  This seems to be the case for Core/Bulldozer/Zen but not for earlier
chips, so we probably want a separate table.  I will do that incrementally (and
also for the scatter/gather instructions) so we can track the effect of individual
changes on the benchmarking bots.

Bootstrapped/regtested x86_64-linux, committed.
Honza

	* gcc.target/i386/pr79683.c: Disable the cost model.
	* i386.c (ix86_builtin_vectorization_cost): Use existing rtx_cost
	latencies instead of having a separate table; distinguish between
	integer and float costs.
	* i386.h (processor_costs): Remove scalar_stmt_cost,
	scalar_load_cost, scalar_store_cost, vec_stmt_cost, vec_to_scalar_cost,
	scalar_to_vec_cost, vec_align_load_cost, vec_unalign_load_cost,
	vec_store_cost.
	* x86-tune-costs.h: Remove the entries that were removed from
	processor_costs from all tables; make cond_taken_branch_cost
	and cond_not_taken_branch_cost COSTS_N_INSNS based.
Index: testsuite/gcc.target/i386/pr79683.c
===================================================================
--- testsuite/gcc.target/i386/pr79683.c	(revision 253957)
+++ testsuite/gcc.target/i386/pr79683.c	(working copy)
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O3 -msse2" } */
+/* { dg-options "-O3 -msse2 -fvect-cost-model=unlimited" } */
 
 struct s {
     __INT64_TYPE__ a;
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(revision 253957)
+++ config/i386/i386.c	(working copy)
@@ -44051,37 +44051,61 @@ static int
 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
                                  tree vectype, int)
 {
+  bool fp = false;
+  machine_mode mode = TImode;
+  if (vectype != NULL)
+    {
+      fp = FLOAT_TYPE_P (vectype);
+      mode = TYPE_MODE (vectype);
+    }
+
   switch (type_of_cost)
     {
       case scalar_stmt:
-        return ix86_cost->scalar_stmt_cost;
+        return fp ? ix86_cost->addss : COSTS_N_INSNS (1);
 
       case scalar_load:
-        return ix86_cost->scalar_load_cost;
+	/* load/store costs are relative to register move which is 2. Recompute
+ 	   it to COSTS_N_INSNS so everything have same base.  */
+        return COSTS_N_INSNS (fp ? ix86_cost->sse_load[0]
+			      : ix86_cost->int_load [2]) / 2;
 
       case scalar_store:
-        return ix86_cost->scalar_store_cost;
+        return COSTS_N_INSNS (fp ? ix86_cost->sse_store[0]
+			      : ix86_cost->int_store [2]) / 2;
 
       case vector_stmt:
-        return ix86_cost->vec_stmt_cost;
+        return ix86_vec_cost (mode,
+			      fp ? ix86_cost->addss : ix86_cost->sse_op,
+			      true);
 
       case vector_load:
-        return ix86_cost->vec_align_load_cost;
+        return ix86_vec_cost (mode,
+			      COSTS_N_INSNS (ix86_cost->sse_load[2]) / 2,
+			      true);
 
       case vector_store:
-        return ix86_cost->vec_store_cost;
+        return ix86_vec_cost (mode,
+			      COSTS_N_INSNS (ix86_cost->sse_store[2]) / 2,
+			      true);
 
       case vec_to_scalar:
-        return ix86_cost->vec_to_scalar_cost;
-
       case scalar_to_vec:
-        return ix86_cost->scalar_to_vec_cost;
+        return ix86_vec_cost (mode, ix86_cost->sse_op, true);
 
+      /* We should have separate costs for unaligned loads and gather/scatter.
+	 Do that incrementally.  */
       case unaligned_load:
-      case unaligned_store:
       case vector_gather_load:
+        return ix86_vec_cost (mode,
+			      COSTS_N_INSNS (ix86_cost->sse_load[2]),
+			      true);
+
+      case unaligned_store:
       case vector_scatter_store:
-        return ix86_cost->vec_unalign_load_cost;
+        return ix86_vec_cost (mode,
+			      COSTS_N_INSNS (ix86_cost->sse_store[2]),
+			      true);
 
       case cond_branch_taken:
         return ix86_cost->cond_taken_branch_cost;
@@ -44091,10 +44115,11 @@ ix86_builtin_vectorization_cost (enum ve
 
       case vec_perm:
       case vec_promote_demote:
-        return ix86_cost->vec_stmt_cost;
+        return ix86_vec_cost (mode,
+			      ix86_cost->sse_op, true);
 
       case vec_construct:
-	return ix86_cost->vec_stmt_cost * (TYPE_VECTOR_SUBPARTS (vectype) - 1);
+	return ix86_vec_cost (mode, ix86_cost->sse_op, false);
 
       default:
         gcc_unreachable ();
Index: config/i386/i386.h
===================================================================
--- config/i386/i386.h	(revision 253957)
+++ config/i386/i386.h	(working copy)
@@ -277,18 +277,6 @@ struct processor_costs {
 				   parallel.  See also
 				   ix86_reassociation_width.  */
   struct stringop_algs *memcpy, *memset;
-  const int scalar_stmt_cost;   /* Cost of any scalar operation, excluding
-				   load and store.  */
-  const int scalar_load_cost;   /* Cost of scalar load.  */
-  const int scalar_store_cost;  /* Cost of scalar store.  */
-  const int vec_stmt_cost;      /* Cost of any vector operation, excluding
-                                   load, store, vector-to-scalar and
-                                   scalar-to-vector operation.  */
-  const int vec_to_scalar_cost;    /* Cost of vect-to-scalar operation.  */
-  const int scalar_to_vec_cost;    /* Cost of scalar-to-vector operation.  */
-  const int vec_align_load_cost;   /* Cost of aligned vector load.  */
-  const int vec_unalign_load_cost; /* Cost of unaligned vector load.  */
-  const int vec_store_cost;        /* Cost of vector store.  */
   const int cond_taken_branch_cost;    /* Cost of taken branch for vectorizer
 					  cost model.  */
   const int cond_not_taken_branch_cost;/* Cost of not taken branch for
Index: config/i386/x86-tune-costs.h
===================================================================
--- config/i386/x86-tune-costs.h	(revision 253958)
+++ config/i386/x86-tune-costs.h	(working copy)
@@ -79,17 +79,8 @@ struct processor_costs ix86_size_cost =
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   ix86_size_memcpy,
   ix86_size_memset,
-  1,					/* scalar_stmt_cost.  */
-  1,					/* scalar load_cost.  */
-  1,					/* scalar_store_cost.  */
-  1,					/* vec_stmt_cost.  */
-  1,					/* vec_to_scalar_cost.  */
-  1,					/* scalar_to_vec_cost.  */
-  1,					/* vec_align_load_cost.  */
-  1,					/* vec_unalign_load_cost.  */
-  1,					/* vec_store_cost.  */
-  1,					/* cond_taken_branch_cost.  */
-  1,					/* cond_not_taken_branch_cost.  */
+  COSTS_N_BYTES (1),			/* cond_taken_branch_cost.  */
+  COSTS_N_BYTES (1),			/* cond_not_taken_branch_cost.  */
 };
 
 /* Processor costs (relative to an add) */
@@ -167,17 +158,8 @@ struct processor_costs i386_cost = {	/*
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   i386_memcpy,
   i386_memset,
-  1,					/* scalar_stmt_cost.  */
-  1,					/* scalar load_cost.  */
-  1,					/* scalar_store_cost.  */
-  1,					/* vec_stmt_cost.  */
-  1,					/* vec_to_scalar_cost.  */
-  1,					/* scalar_to_vec_cost.  */
-  1,					/* vec_align_load_cost.  */
-  2,					/* vec_unalign_load_cost.  */
-  1,					/* vec_store_cost.  */
-  3,					/* cond_taken_branch_cost.  */
-  1,					/* cond_not_taken_branch_cost.  */
+  COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
+  COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
 };
 
 static stringop_algs i486_memcpy[2] = {
@@ -256,17 +238,8 @@ struct processor_costs i486_cost = {	/*
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   i486_memcpy,
   i486_memset,
-  1,					/* scalar_stmt_cost.  */
-  1,					/* scalar load_cost.  */
-  1,					/* scalar_store_cost.  */
-  1,					/* vec_stmt_cost.  */
-  1,					/* vec_to_scalar_cost.  */
-  1,					/* scalar_to_vec_cost.  */
-  1,					/* vec_align_load_cost.  */
-  2,					/* vec_unalign_load_cost.  */
-  1,					/* vec_store_cost.  */
-  3,					/* cond_taken_branch_cost.  */
-  1,					/* cond_not_taken_branch_cost.  */
+  COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
+  COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
 };
 
 static stringop_algs pentium_memcpy[2] = {
@@ -343,17 +316,8 @@ struct processor_costs pentium_cost = {
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   pentium_memcpy,
   pentium_memset,
-  1,					/* scalar_stmt_cost.  */
-  1,					/* scalar load_cost.  */
-  1,					/* scalar_store_cost.  */
-  1,					/* vec_stmt_cost.  */
-  1,					/* vec_to_scalar_cost.  */
-  1,					/* scalar_to_vec_cost.  */
-  1,					/* vec_align_load_cost.  */
-  2,					/* vec_unalign_load_cost.  */
-  1,					/* vec_store_cost.  */
-  3,					/* cond_taken_branch_cost.  */
-  1,					/* cond_not_taken_branch_cost.  */
+  COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
+  COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
 };
 
 static const
@@ -423,17 +387,8 @@ struct processor_costs lakemont_cost = {
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   pentium_memcpy,
   pentium_memset,
-  1,					/* scalar_stmt_cost.  */
-  1,					/* scalar load_cost.  */
-  1,					/* scalar_store_cost.  */
-  1,					/* vec_stmt_cost.  */
-  1,					/* vec_to_scalar_cost.  */
-  1,					/* scalar_to_vec_cost.  */
-  1,					/* vec_align_load_cost.  */
-  2,					/* vec_unalign_load_cost.  */
-  1,					/* vec_store_cost.  */
-  3,					/* cond_taken_branch_cost.  */
-  1,					/* cond_not_taken_branch_cost.  */
+  COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
+  COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
 };
 
 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
@@ -518,17 +473,8 @@ struct processor_costs pentiumpro_cost =
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   pentiumpro_memcpy,
   pentiumpro_memset,
-  1,					/* scalar_stmt_cost.  */
-  1,					/* scalar load_cost.  */
-  1,					/* scalar_store_cost.  */
-  1,					/* vec_stmt_cost.  */
-  1,					/* vec_to_scalar_cost.  */
-  1,					/* scalar_to_vec_cost.  */
-  1,					/* vec_align_load_cost.  */
-  2,					/* vec_unalign_load_cost.  */
-  1,					/* vec_store_cost.  */
-  3,					/* cond_taken_branch_cost.  */
-  1,					/* cond_not_taken_branch_cost.  */
+  COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
+  COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
 };
 
 static stringop_algs geode_memcpy[2] = {
@@ -605,17 +551,8 @@ struct processor_costs geode_cost = {
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   geode_memcpy,
   geode_memset,
-  1,					/* scalar_stmt_cost.  */
-  1,					/* scalar load_cost.  */
-  1,					/* scalar_store_cost.  */
-  1,					/* vec_stmt_cost.  */
-  1,					/* vec_to_scalar_cost.  */
-  1,					/* scalar_to_vec_cost.  */
-  1,					/* vec_align_load_cost.  */
-  2,					/* vec_unalign_load_cost.  */
-  1,					/* vec_store_cost.  */
-  3,					/* cond_taken_branch_cost.  */
-  1,					/* cond_not_taken_branch_cost.  */
+  COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
+  COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
 };
 
 static stringop_algs k6_memcpy[2] = {
@@ -694,17 +631,8 @@ struct processor_costs k6_cost = {
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   k6_memcpy,
   k6_memset,
-  1,					/* scalar_stmt_cost.  */
-  1,					/* scalar load_cost.  */
-  1,					/* scalar_store_cost.  */
-  1,					/* vec_stmt_cost.  */
-  1,					/* vec_to_scalar_cost.  */
-  1,					/* scalar_to_vec_cost.  */
-  1,					/* vec_align_load_cost.  */
-  2,					/* vec_unalign_load_cost.  */
-  1,					/* vec_store_cost.  */
-  3,					/* cond_taken_branch_cost.  */
-  1,					/* cond_not_taken_branch_cost.  */
+  COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
+  COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
 };
 
 /* For some reason, Athlon deals better with REP prefix (relative to loops)
@@ -784,17 +712,8 @@ struct processor_costs athlon_cost = {
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   athlon_memcpy,
   athlon_memset,
-  1,					/* scalar_stmt_cost.  */
-  1,					/* scalar load_cost.  */
-  1,					/* scalar_store_cost.  */
-  1,					/* vec_stmt_cost.  */
-  1,					/* vec_to_scalar_cost.  */
-  1,					/* scalar_to_vec_cost.  */
-  1,					/* vec_align_load_cost.  */
-  2,					/* vec_unalign_load_cost.  */
-  1,					/* vec_store_cost.  */
-  3,					/* cond_taken_branch_cost.  */
-  1,					/* cond_not_taken_branch_cost.  */
+  COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
+  COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
 };
 
 /* K8 has optimized REP instruction for medium sized blocks, but for very
@@ -883,17 +802,8 @@ struct processor_costs k8_cost = {
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   k8_memcpy,
   k8_memset,
-  4,					/* scalar_stmt_cost.  */
-  2,					/* scalar load_cost.  */
-  2,					/* scalar_store_cost.  */
-  5,					/* vec_stmt_cost.  */
-  0,					/* vec_to_scalar_cost.  */
-  2,					/* scalar_to_vec_cost.  */
-  2,					/* vec_align_load_cost.  */
-  3,					/* vec_unalign_load_cost.  */
-  3,					/* vec_store_cost.  */
-  3,					/* cond_taken_branch_cost.  */
-  2,					/* cond_not_taken_branch_cost.  */
+  COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
+  COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
 };
 
 /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
@@ -989,17 +899,8 @@ struct processor_costs amdfam10_cost = {
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   amdfam10_memcpy,
   amdfam10_memset,
-  4,					/* scalar_stmt_cost.  */
-  2,					/* scalar load_cost.  */
-  2,					/* scalar_store_cost.  */
-  6,					/* vec_stmt_cost.  */
-  0,					/* vec_to_scalar_cost.  */
-  2,					/* scalar_to_vec_cost.  */
-  2,					/* vec_align_load_cost.  */
-  2,					/* vec_unalign_load_cost.  */
-  2,					/* vec_store_cost.  */
-  2,					/* cond_taken_branch_cost.  */
-  1,					/* cond_not_taken_branch_cost.  */
+  COSTS_N_INSNS (2),			/* cond_taken_branch_cost.  */
+  COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
 };
 
 /*  BDVER1 has optimized REP instruction for medium sized blocks, but for
@@ -1097,17 +998,8 @@ const struct processor_costs bdver1_cost
   1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   bdver1_memcpy,
   bdver1_memset,
-  6,					/* scalar_stmt_cost.  */
-  4,					/* scalar load_cost.  */
-  4,					/* scalar_store_cost.  */
-  6,					/* vec_stmt_cost.  */
-  0,					/* vec_to_scalar_cost.  */
-  2,					/* scalar_to_vec_cost.  */
-  4,					/* vec_align_load_cost.  */
-  4,					/* vec_unalign_load_cost.  */
-  4,					/* vec_store_cost.  */
-  4,					/* cond_taken_branch_cost.  */
-  2,					/* cond_not_taken_branch_cost.  */
+  COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
+  COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
 };
 
 /*  BDVER2 has optimized REP instruction for medium sized blocks, but for
@@ -1206,17 +1098,8 @@ const struct processor_costs bdver2_cost
   1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   bdver2_memcpy,
   bdver2_memset,
-  6,					/* scalar_stmt_cost.  */
-  4,					/* scalar load_cost.  */
-  4,					/* scalar_store_cost.  */
-  6,					/* vec_stmt_cost.  */
-  0,					/* vec_to_scalar_cost.  */
-  2,					/* scalar_to_vec_cost.  */
-  4,					/* vec_align_load_cost.  */
-  4,					/* vec_unalign_load_cost.  */
-  4,					/* vec_store_cost.  */
-  4,					/* cond_taken_branch_cost.  */
-  2,					/* cond_not_taken_branch_cost.  */
+  COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
+  COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
 };
 
 
@@ -1306,17 +1189,8 @@ struct processor_costs bdver3_cost = {
   1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   bdver3_memcpy,
   bdver3_memset,
-  6,					/* scalar_stmt_cost.  */
-  4,					/* scalar load_cost.  */
-  4,					/* scalar_store_cost.  */
-  6,					/* vec_stmt_cost.  */
-  0,					/* vec_to_scalar_cost.  */
-  2,					/* scalar_to_vec_cost.  */
-  4,					/* vec_align_load_cost.  */
-  4,					/* vec_unalign_load_cost.  */
-  4,					/* vec_store_cost.  */
-  4,					/* cond_taken_branch_cost.  */
-  2,					/* cond_not_taken_branch_cost.  */
+  COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
+  COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
 };
 
 /*  BDVER4 has optimized REP instruction for medium sized blocks, but for
@@ -1405,17 +1279,8 @@ struct processor_costs bdver4_cost = {
   1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   bdver4_memcpy,
   bdver4_memset,
-  6,					/* scalar_stmt_cost.  */
-  4,					/* scalar load_cost.  */
-  4,					/* scalar_store_cost.  */
-  6,					/* vec_stmt_cost.  */
-  0,					/* vec_to_scalar_cost.  */
-  2,					/* scalar_to_vec_cost.  */
-  4,					/* vec_align_load_cost.  */
-  4,					/* vec_unalign_load_cost.  */
-  4,					/* vec_store_cost.  */
-  4,					/* cond_taken_branch_cost.  */
-  2,					/* cond_not_taken_branch_cost.  */
+  COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
+  COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
 };
 
 
@@ -1524,17 +1389,8 @@ struct processor_costs znver1_cost = {
   4, 4, 3, 6,				/* reassoc int, fp, vec_int, vec_fp.  */
   znver1_memcpy,
   znver1_memset,
-  6,					/* scalar_stmt_cost.  */
-  4,					/* scalar load_cost.  */
-  4,					/* scalar_store_cost.  */
-  6,					/* vec_stmt_cost.  */
-  0,					/* vec_to_scalar_cost.  */
-  2,					/* scalar_to_vec_cost.  */
-  4,					/* vec_align_load_cost.  */
-  4,					/* vec_unalign_load_cost.  */
-  4,					/* vec_store_cost.  */
-  4,					/* cond_taken_branch_cost.  */
-  2,					/* cond_not_taken_branch_cost.  */
+  COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
+  COSTS_N_INSNS (2),			/* cond_not_taken_branch_cost.  */
 };
 
   /* BTVER1 has optimized REP instruction for medium sized blocks, but for
@@ -1624,17 +1480,8 @@ const struct processor_costs btver1_cost
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   btver1_memcpy,
   btver1_memset,
-  4,					/* scalar_stmt_cost.  */
-  2,					/* scalar load_cost.  */
-  2,					/* scalar_store_cost.  */
-  6,					/* vec_stmt_cost.  */
-  0,					/* vec_to_scalar_cost.  */
-  2,					/* scalar_to_vec_cost.  */
-  2,					/* vec_align_load_cost.  */
-  2,					/* vec_unalign_load_cost.  */
-  2,					/* vec_store_cost.  */
-  2,					/* cond_taken_branch_cost.  */
-  1,					/* cond_not_taken_branch_cost.  */
+  COSTS_N_INSNS (2),			/* cond_taken_branch_cost.  */
+  COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
 };
 
 static stringop_algs btver2_memcpy[2] = {
@@ -1721,17 +1568,8 @@ const struct processor_costs btver2_cost
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   btver2_memcpy,
   btver2_memset,
-  4,					/* scalar_stmt_cost.  */
-  2,					/* scalar load_cost.  */
-  2,					/* scalar_store_cost.  */
-  6,					/* vec_stmt_cost.  */
-  0,					/* vec_to_scalar_cost.  */
-  2,					/* scalar_to_vec_cost.  */
-  2,					/* vec_align_load_cost.  */
-  2,					/* vec_unalign_load_cost.  */
-  2,					/* vec_store_cost.  */
-  2,					/* cond_taken_branch_cost.  */
-  1,					/* cond_not_taken_branch_cost.  */
+  COSTS_N_INSNS (2),			/* cond_taken_branch_cost.  */
+  COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
 };
 
 static stringop_algs pentium4_memcpy[2] = {
@@ -1809,17 +1647,8 @@ struct processor_costs pentium4_cost = {
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   pentium4_memcpy,
   pentium4_memset,
-  1,					/* scalar_stmt_cost.  */
-  1,					/* scalar load_cost.  */
-  1,					/* scalar_store_cost.  */
-  1,					/* vec_stmt_cost.  */
-  1,					/* vec_to_scalar_cost.  */
-  1,					/* scalar_to_vec_cost.  */
-  1,					/* vec_align_load_cost.  */
-  2,					/* vec_unalign_load_cost.  */
-  1,					/* vec_store_cost.  */
-  3,					/* cond_taken_branch_cost.  */
-  1,					/* cond_not_taken_branch_cost.  */
+  COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
+  COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
 };
 
 static stringop_algs nocona_memcpy[2] = {
@@ -1900,17 +1729,8 @@ struct processor_costs nocona_cost = {
   1, 1, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   nocona_memcpy,
   nocona_memset,
-  1,					/* scalar_stmt_cost.  */
-  1,					/* scalar load_cost.  */
-  1,					/* scalar_store_cost.  */
-  1,					/* vec_stmt_cost.  */
-  1,					/* vec_to_scalar_cost.  */
-  1,					/* scalar_to_vec_cost.  */
-  1,					/* vec_align_load_cost.  */
-  2,					/* vec_unalign_load_cost.  */
-  1,					/* vec_store_cost.  */
-  3,					/* cond_taken_branch_cost.  */
-  1,					/* cond_not_taken_branch_cost.  */
+  COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
+  COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
 };
 
 static stringop_algs atom_memcpy[2] = {
@@ -1989,17 +1809,8 @@ struct processor_costs atom_cost = {
   2, 2, 2, 2,				/* reassoc int, fp, vec_int, vec_fp.  */
   atom_memcpy,
   atom_memset,
-  1,					/* scalar_stmt_cost.  */
-  1,					/* scalar load_cost.  */
-  1,					/* scalar_store_cost.  */
-  1,					/* vec_stmt_cost.  */
-  1,					/* vec_to_scalar_cost.  */
-  1,					/* scalar_to_vec_cost.  */
-  1,					/* vec_align_load_cost.  */
-  2,					/* vec_unalign_load_cost.  */
-  1,					/* vec_store_cost.  */
-  3,					/* cond_taken_branch_cost.  */
-  1,					/* cond_not_taken_branch_cost.  */
+  COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
+  COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
 };
 
 static stringop_algs slm_memcpy[2] = {
@@ -2078,17 +1889,8 @@ struct processor_costs slm_cost = {
   1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   slm_memcpy,
   slm_memset,
-  1,					/* scalar_stmt_cost.  */
-  1,					/* scalar load_cost.  */
-  1,					/* scalar_store_cost.  */
-  1,					/* vec_stmt_cost.  */
-  4,					/* vec_to_scalar_cost.  */
-  1,					/* scalar_to_vec_cost.  */
-  1,					/* vec_align_load_cost.  */
-  2,					/* vec_unalign_load_cost.  */
-  1,					/* vec_store_cost.  */
-  3,					/* cond_taken_branch_cost.  */
-  1,					/* cond_not_taken_branch_cost.  */
+  COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
+  COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
 };
 
 static stringop_algs intel_memcpy[2] = {
@@ -2167,17 +1969,8 @@ struct processor_costs intel_cost = {
   1, 4, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   intel_memcpy,
   intel_memset,
-  1,					/* scalar_stmt_cost.  */
-  1,					/* scalar load_cost.  */
-  1,					/* scalar_store_cost.  */
-  1,					/* vec_stmt_cost.  */
-  4,					/* vec_to_scalar_cost.  */
-  1,					/* scalar_to_vec_cost.  */
-  1,					/* vec_align_load_cost.  */
-  2,					/* vec_unalign_load_cost.  */
-  1,					/* vec_store_cost.  */
-  3,					/* cond_taken_branch_cost.  */
-  1,					/* cond_not_taken_branch_cost.  */
+  COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
+  COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
 };
 
 /* Generic should produce code tuned for Core-i7 (and newer chips)
@@ -2265,17 +2058,8 @@ struct processor_costs generic_cost = {
   1, 2, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
   generic_memcpy,
   generic_memset,
-  1,					/* scalar_stmt_cost.  */
-  1,					/* scalar load_cost.  */
-  1,					/* scalar_store_cost.  */
-  1,					/* vec_stmt_cost.  */
-  1,					/* vec_to_scalar_cost.  */
-  1,					/* scalar_to_vec_cost.  */
-  1,					/* vec_align_load_cost.  */
-  2,					/* vec_unalign_load_cost.  */
-  1,					/* vec_store_cost.  */
-  3,					/* cond_taken_branch_cost.  */
-  1,					/* cond_not_taken_branch_cost.  */
+  COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
+  COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
 };
 
 /* core_cost should produce code tuned for Core familly of CPUs.  */
@@ -2366,16 +2150,7 @@ struct processor_costs core_cost = {
   1, 4, 2, 2,				/* reassoc int, fp, vec_int, vec_fp.  */
   core_memcpy,
   core_memset,
-  1,					/* scalar_stmt_cost.  */
-  1,					/* scalar load_cost.  */
-  1,					/* scalar_store_cost.  */
-  1,					/* vec_stmt_cost.  */
-  1,					/* vec_to_scalar_cost.  */
-  1,					/* scalar_to_vec_cost.  */
-  1,					/* vec_align_load_cost.  */
-  2,					/* vec_unalign_load_cost.  */
-  1,					/* vec_store_cost.  */
-  3,					/* cond_taken_branch_cost.  */
-  1,					/* cond_not_taken_branch_cost.  */
+  COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
+  COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
 };
Toon Moene Oct. 21, 2017, 2:03 p.m. UTC | #10
On 10/17/2017 07:22 PM, Jan Hubicka wrote:

> According to Agner's tables, gathers range from 12 ops (vgatherdpd)
> to 66 ops (vpgatherdd).  I assume that CPU needs to do following:

In our code it is basically a "don't care" how much work a gather 
instruction has to do internally.

Without gather, the most expensive loop in our code couldn't be 
vectorized (there are only a handful of gather instructions in that loop, 
next to dozens of other vector instructions).
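
For illustration (this is not our actual code), a loop of roughly this 
shape only vectorizes once the indexed load can be turned into a gather:

  /* b[idx[i]] is a non-contiguous, indexed load; a vectorized
     iteration needs a gather to fetch b[idx[i..i+N-1]] at once.  */
  for (int i = 0; i < n; i++)
    a[i] += c[i] * b[idx[i]];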

Kind regards,

Patch

Index: config/aarch64/aarch64.c
===================================================================
--- config/aarch64/aarch64.c	(revision 253789)
+++ config/aarch64/aarch64.c	(working copy)
@@ -8547,9 +8547,10 @@  aarch64_builtin_vectorization_cost (enum
 	return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
 
       case vector_load:
+      case vector_gather_load:
 	return costs->vec_align_load_cost;
 
-      case vector_store:
+      case vector_scatter_store:
 	return costs->vec_store_cost;
 
       case vec_to_scalar:
Index: config/arm/arm.c
===================================================================
--- config/arm/arm.c	(revision 253789)
+++ config/arm/arm.c	(working copy)
@@ -11241,9 +11241,11 @@  arm_builtin_vectorization_cost (enum vec
         return current_tune->vec_costs->vec_stmt_cost;
 
       case vector_load:
+      case vector_gather_load:
         return current_tune->vec_costs->vec_align_load_cost;
 
       case vector_store:
+      case vector_scatter_store:
         return current_tune->vec_costs->vec_store_cost;
 
       case vec_to_scalar:
Index: config/powerpcspe/powerpcspe.c
===================================================================
--- config/powerpcspe/powerpcspe.c	(revision 253789)
+++ config/powerpcspe/powerpcspe.c	(working copy)
@@ -5834,6 +5834,8 @@  rs6000_builtin_vectorization_cost (enum
       case vector_stmt:
       case vector_load:
       case vector_store:
+      case vector_gather_load:
+      case vector_scatter_store:
       case vec_to_scalar:
       case scalar_to_vec:
       case cond_branch_not_taken:
Index: config/rs6000/rs6000.c
===================================================================
--- config/rs6000/rs6000.c	(revision 253789)
+++ config/rs6000/rs6000.c	(working copy)
@@ -5398,6 +5398,8 @@  rs6000_builtin_vectorization_cost (enum
       case vector_stmt:
       case vector_load:
       case vector_store:
+      case vector_gather_load:
+      case vector_scatter_store:
       case vec_to_scalar:
       case scalar_to_vec:
       case cond_branch_not_taken:
Index: config/s390/s390.c
===================================================================
--- config/s390/s390.c	(revision 253789)
+++ config/s390/s390.c	(working copy)
@@ -3717,6 +3717,8 @@  s390_builtin_vectorization_cost (enum ve
       case vector_stmt:
       case vector_load:
       case vector_store:
+      case vector_gather_load:
+      case vector_scatter_store:
       case vec_to_scalar:
       case scalar_to_vec:
       case cond_branch_not_taken:
Index: config/spu/spu.c
===================================================================
--- config/spu/spu.c	(revision 253789)
+++ config/spu/spu.c	(working copy)
@@ -6625,6 +6625,8 @@  spu_builtin_vectorization_cost (enum vec
       case vector_stmt:
       case vector_load:
       case vector_store:
+      case vector_gather_load:
+      case vector_scatter_store:
       case vec_to_scalar:
       case scalar_to_vec:
       case cond_branch_not_taken:
Index: target.h
===================================================================
--- target.h	(revision 253789)
+++ target.h	(working copy)
@@ -171,9 +171,11 @@  enum vect_cost_for_stmt
   scalar_store,
   vector_stmt,
   vector_load,
+  vector_gather_load,
   unaligned_load,
   unaligned_store,
   vector_store,
+  vector_scatter_store,
   vec_to_scalar,
   scalar_to_vec,
   cond_branch_not_taken,
Index: tree-vect-stmts.c
===================================================================
--- tree-vect-stmts.c	(revision 253789)
+++ tree-vect-stmts.c	(working copy)
@@ -95,6 +95,12 @@  record_stmt_cost (stmt_vector_for_cost *
 		  enum vect_cost_for_stmt kind, stmt_vec_info stmt_info,
 		  int misalign, enum vect_cost_model_location where)
 {
+  if ((kind == vector_load || kind == unaligned_load)
+      && STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+    kind = vector_gather_load;
+  if ((kind == vector_store || kind == unaligned_store)
+      && STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+    kind = vector_scatter_store;
   if (body_cost_vec)
     {
       tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
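
For reference, here is a minimal sketch (not part of the patch above; the
target name and cost numbers are made up) of how a backend's
vectorization-cost hook could now charge gather/scatter separately from
ordinary vector memory accesses:

/* Hypothetical example only: "example" is not a real target and the
   numbers are placeholders, loosely based on the multi-uop gather cost
   quoted from Agner's tables earlier in the thread.  */
static int
example_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
                                    tree vectype ATTRIBUTE_UNUSED,
                                    int misalign ATTRIBUTE_UNUSED)
{
  switch (type_of_cost)
    {
    case vector_load:
    case unaligned_load:
    case vector_store:
    case unaligned_store:
      return 1;

    case vector_gather_load:
    case vector_scatter_store:
      /* Charge gathers/scatters much more than contiguous accesses.  */
      return 12;

    default:
      return 1;
    }
}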