diff mbox

[PATCH/VECT/AARCH64] Improve cost model for ThunderX2 CN99xx

Message ID CA+=Sn1nDJF0d9kBbRCsm9aqh7g=N9RZC7LJQ6nE_yLKLvWwe7Q@mail.gmail.com
State New
Headers show

Commit Message

Andrew Pinski Jan. 31, 2017, 10:34 p.m. UTC
On Sat, Jan 28, 2017 at 12:34 PM, Andrew Pinski <apinski@cavium.com> wrote:
> Hi,
>   On some (most) AARCH64 cores, it is not always profitable to
> vectorize some integer loops.  This patch does two things (I can split
> it into different patches if needed).
> 1) It splits the aarch64 back-end's vector cost model's vector and
> scalar costs into int and fp fields
> 1a) For thunderx2t99, models correctly the integer vector/scalar costs.
> 2) Fixes/Improves a few calls to record_stmt_cost in tree-vect-loop.c
> where stmt_info was not being passed.
>
> OK?  Bootstrapped and tested on aarch64-linux-gnu and provides 20% on
> libquantum and ~1% overall on SPEC CPU 2006 int.

Here is the updated patch with the fixes requested by both Richards.
Still the same performance as above.

OK?

Thanks,
Andrew

ChangeLog:
* tree-vect-loop.c (vect_compute_single_scalar_iteration_cost): Pass
stmt_info to record_stmt_cost.
(vect_get_known_peeling_cost): Pass stmt_info if known to record_stmt_cost.

* config/aarch64/aarch64-protos.h (cpu_vector_cost): Split
cpu_vector_cost field into
scalar_int_stmt_cost and scalar_fp_stmt_cost.  Split vec_stmt_cost
field into vec_int_stmt_cost and vec_fp_stmt_cost.
* config/aarch64/aarch64.c (generic_vector_cost): Update for the
splitting of scalar_stmt_cost and vec_stmt_cost.
(thunderx_vector_cost): Likewise.
(cortexa57_vector_cost): Likewise.
(exynosm1_vector_cost): Likewise.
(xgene1_vector_cost): Likewise.
(thunderx2t99_vector_cost): Improve after the splitting of the two fields.
(aarch64_builtin_vectorization_cost): Update for the splitting of
scalar_stmt_cost and vec_stmt_cost.

>
> Thanks,
> Andrew Pinski
>
> ChangeLog:
> * tree-vect-loop.c (vect_compute_single_scalar_iteration_cost): Pass
> stmt_info to record_stmt_cost.
> (vect_get_known_peeling_cost): Pass stmt_info if known to record_stmt_cost.
>
> * config/aarch64/aarch64-protos.h (cpu_vector_cost): Split
> cpu_vector_cost field into
> scalar_int_stmt_cost and scalar_fp_stmt_cost.  Split vec_stmt_cost
> field into vec_int_stmt_cost and vec_fp_stmt_cost.
> * config/aarch64/aarch64.c (generic_vector_cost): Update for the
> splitting of scalar_stmt_cost and vec_stmt_cost.
> (thunderx_vector_cost): Likewise.
> (cortexa57_vector_cost): LIkewise.
> (exynosm1_vector_cost): Likewise.
> (xgene1_vector_cost): Likewise.
> (thunderx2t99_vector_cost): Improve after the splitting of the two fields.
> (aarch64_builtin_vectorization_cost): Update for the splitting of
> scalar_stmt_cost and vec_stmt_cost.

Comments

Richard Earnshaw (lists) Feb. 1, 2017, 10:02 a.m. UTC | #1
On 31/01/17 22:34, Andrew Pinski wrote:
> On Sat, Jan 28, 2017 at 12:34 PM, Andrew Pinski <apinski@cavium.com> wrote:
>> Hi,
>>   On some (most) AARCH64 cores, it is not always profitable to
>> vectorize some integer loops.  This patch does two things (I can split
>> it into different patches if needed).
>> 1) It splits the aarch64 back-end's vector cost model's vector and
>> scalar costs into int and fp fields
>> 1a) For thunderx2t99, models correctly the integer vector/scalar costs.
>> 2) Fixes/Improves a few calls to record_stmt_cost in tree-vect-loop.c
>> where stmt_info was not being passed.
>>
>> OK?  Bootstrapped and tested on aarch64-linux-gnu and provides 20% on
>> libquantum and ~1% overall on SPEC CPU 2006 int.
> 
> Here is the updated patch with the fixes requested by both Richards.
> Still the same performance as above.
> 
> OK?
> 
> Thanks,
> Andrew
> 
> ChangLog:
> * tree-vect-loop.c (vect_compute_single_scalar_iteration_cost): Pass
> stmt_info to record_stmt_cost.
> (vect_get_known_peeling_cost): Pass stmt_info if known to record_stmt_cost.
> 
> * config/aarch64/aarch64-protos.h (cpu_vector_cost): Split
> cpu_vector_cost field into
> scalar_int_stmt_cost and scalar_fp_stmt_cost.  Split vec_stmt_cost
> field into vec_int_stmt_cost and vec_fp_stmt_cost.
> * config/aarch64/aarch64.c (generic_vector_cost): Update for the
> splitting of scalar_stmt_cost and vec_stmt_cost.
> (thunderx_vector_cost): Likewise.
> (cortexa57_vector_cost): LIkewise.
> (exynosm1_vector_cost): Likewise.
> (xgene1_vector_cost): Likewise.
> (thunderx2t99_vector_cost): Improve after the splitting of the two fields.
> (aarch64_builtin_vectorization_cost): Update for the splitting of
> scalar_stmt_cost and vec_stmt_cost.
> 
>>
>> Thanks,
>> Andrew Pinski
>>
>> ChangeLog:
>> * tree-vect-loop.c (vect_compute_single_scalar_iteration_cost): Pass
>> stmt_info to record_stmt_cost.
>> (vect_get_known_peeling_cost): Pass stmt_info if known to record_stmt_cost.
>>
>> * config/aarch64/aarch64-protos.h (cpu_vector_cost): Split
>> cpu_vector_cost field into
>> scalar_int_stmt_cost and scalar_fp_stmt_cost.  Split vec_stmt_cost
>> field into vec_int_stmt_cost and vec_fp_stmt_cost.
>> * config/aarch64/aarch64.c (generic_vector_cost): Update for the
>> splitting of scalar_stmt_cost and vec_stmt_cost.
>> (thunderx_vector_cost): Likewise.
>> (cortexa57_vector_cost): LIkewise.
>> (exynosm1_vector_cost): Likewise.
>> (xgene1_vector_cost): Likewise.
>> (thunderx2t99_vector_cost): Improve after the splitting of the two fields.
>> (aarch64_builtin_vectorization_cost): Update for the splitting of
>> scalar_stmt_cost and vec_stmt_cost.
>>

OK.

R.

>> updatedvectcost.diff.txt
>>
>>
>> Index: config/aarch64/aarch64-protos.h
>> ===================================================================
>> --- config/aarch64/aarch64-protos.h	(revision 245070)
>> +++ config/aarch64/aarch64-protos.h	(working copy)
>> @@ -151,11 +151,17 @@ struct cpu_regmove_cost
>>  /* Cost for vector insn classes.  */
>>  struct cpu_vector_cost
>>  {
>> -  const int scalar_stmt_cost;		 /* Cost of any scalar operation,
>> +  const int scalar_int_stmt_cost;	 /* Cost of any int scalar operation,
>> +					    excluding load and store.  */
>> +  const int scalar_fp_stmt_cost;	 /* Cost of any fp scalar operation,
>>  					    excluding load and store.  */
>>    const int scalar_load_cost;		 /* Cost of scalar load.  */
>>    const int scalar_store_cost;		 /* Cost of scalar store.  */
>> -  const int vec_stmt_cost;		 /* Cost of any vector operation,
>> +  const int vec_int_stmt_cost;		 /* Cost of any int vector operation,
>> +					    excluding load, store, permute,
>> +					    vector-to-scalar and
>> +					    scalar-to-vector operation.  */
>> +  const int vec_fp_stmt_cost;		 /* Cost of any fp vector operation,
>>  					    excluding load, store, permute,
>>  					    vector-to-scalar and
>>  					    scalar-to-vector operation.  */
>> Index: config/aarch64/aarch64.c
>> ===================================================================
>> --- config/aarch64/aarch64.c	(revision 245070)
>> +++ config/aarch64/aarch64.c	(working copy)
>> @@ -365,10 +365,12 @@ static const struct cpu_regmove_cost thu
>>  /* Generic costs for vector insn classes.  */
>>  static const struct cpu_vector_cost generic_vector_cost =
>>  {
>> -  1, /* scalar_stmt_cost  */
>> +  1, /* scalar_int_stmt_cost  */
>> +  1, /* scalar_fp_stmt_cost  */
>>    1, /* scalar_load_cost  */
>>    1, /* scalar_store_cost  */
>> -  1, /* vec_stmt_cost  */
>> +  1, /* vec_int_stmt_cost  */
>> +  1, /* vec_fp_stmt_cost  */
>>    2, /* vec_permute_cost  */
>>    1, /* vec_to_scalar_cost  */
>>    1, /* scalar_to_vec_cost  */
>> @@ -383,10 +385,12 @@ static const struct cpu_vector_cost gene
>>  /* ThunderX costs for vector insn classes.  */
>>  static const struct cpu_vector_cost thunderx_vector_cost =
>>  {
>> -  1, /* scalar_stmt_cost  */
>> +  1, /* scalar_int_stmt_cost  */
>> +  1, /* scalar_fp_stmt_cost  */
>>    3, /* scalar_load_cost  */
>>    1, /* scalar_store_cost  */
>> -  4, /* vec_stmt_cost  */
>> +  4, /* vec_int_stmt_cost  */
>> +  4, /* vec_fp_stmt_cost  */
>>    4, /* vec_permute_cost  */
>>    2, /* vec_to_scalar_cost  */
>>    2, /* scalar_to_vec_cost  */
>> @@ -401,10 +405,12 @@ static const struct cpu_vector_cost thun
>>  /* Generic costs for vector insn classes.  */
>>  static const struct cpu_vector_cost cortexa57_vector_cost =
>>  {
>> -  1, /* scalar_stmt_cost  */
>> +  1, /* scalar_int_stmt_cost  */
>> +  1, /* scalar_fp_stmt_cost  */
>>    4, /* scalar_load_cost  */
>>    1, /* scalar_store_cost  */
>> -  2, /* vec_stmt_cost  */
>> +  2, /* vec_int_stmt_cost  */
>> +  2, /* vec_fp_stmt_cost  */
>>    3, /* vec_permute_cost  */
>>    8, /* vec_to_scalar_cost  */
>>    8, /* scalar_to_vec_cost  */
>> @@ -418,10 +424,12 @@ static const struct cpu_vector_cost cort
>>  
>>  static const struct cpu_vector_cost exynosm1_vector_cost =
>>  {
>> -  1, /* scalar_stmt_cost  */
>> +  1, /* scalar_int_stmt_cost  */
>> +  1, /* scalar_fp_stmt_cost  */
>>    5, /* scalar_load_cost  */
>>    1, /* scalar_store_cost  */
>> -  3, /* vec_stmt_cost  */
>> +  3, /* vec_int_stmt_cost  */
>> +  3, /* vec_fp_stmt_cost  */
>>    3, /* vec_permute_cost  */
>>    3, /* vec_to_scalar_cost  */
>>    3, /* scalar_to_vec_cost  */
>> @@ -436,10 +444,12 @@ static const struct cpu_vector_cost exyn
>>  /* Generic costs for vector insn classes.  */
>>  static const struct cpu_vector_cost xgene1_vector_cost =
>>  {
>> -  1, /* scalar_stmt_cost  */
>> +  1, /* scalar_int_stmt_cost  */
>> +  1, /* scalar_fp_stmt_cost  */
>>    5, /* scalar_load_cost  */
>>    1, /* scalar_store_cost  */
>> -  2, /* vec_stmt_cost  */
>> +  2, /* vec_int_stmt_cost  */
>> +  2, /* vec_fp_stmt_cost  */
>>    2, /* vec_permute_cost  */
>>    4, /* vec_to_scalar_cost  */
>>    4, /* scalar_to_vec_cost  */
>> @@ -454,10 +464,12 @@ static const struct cpu_vector_cost xgen
>>  /* Costs for vector insn classes for Vulcan.  */
>>  static const struct cpu_vector_cost thunderx2t99_vector_cost =
>>  {
>> -  6, /* scalar_stmt_cost  */
>> +  1, /* scalar_int_stmt_cost  */
>> +  6, /* scalar_fp_stmt_cost  */
>>    4, /* scalar_load_cost  */
>>    1, /* scalar_store_cost  */
>> -  6, /* vec_stmt_cost  */
>> +  5, /* vec_int_stmt_cost  */
>> +  6, /* vec_fp_stmt_cost  */
>>    3, /* vec_permute_cost  */
>>    6, /* vec_to_scalar_cost  */
>>    5, /* scalar_to_vec_cost  */
>> @@ -8119,50 +8131,55 @@ aarch64_builtin_vectorization_cost (enum
>>  				    int misalign ATTRIBUTE_UNUSED)
>>  {
>>    unsigned elements;
>> +  const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
>> +  bool fp = false;
>> +
>> +  if (vectype != NULL)
>> +    fp = FLOAT_TYPE_P (vectype);
>>  
>>    switch (type_of_cost)
>>      {
>>        case scalar_stmt:
>> -	return aarch64_tune_params.vec_costs->scalar_stmt_cost;
>> +	return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
>>  
>>        case scalar_load:
>> -	return aarch64_tune_params.vec_costs->scalar_load_cost;
>> +	return costs->scalar_load_cost;
>>  
>>        case scalar_store:
>> -	return aarch64_tune_params.vec_costs->scalar_store_cost;
>> +	return costs->scalar_store_cost;
>>  
>>        case vector_stmt:
>> -	return aarch64_tune_params.vec_costs->vec_stmt_cost;
>> +	return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
>>  
>>        case vector_load:
>> -	return aarch64_tune_params.vec_costs->vec_align_load_cost;
>> +	return costs->vec_align_load_cost;
>>  
>>        case vector_store:
>> -	return aarch64_tune_params.vec_costs->vec_store_cost;
>> +	return costs->vec_store_cost;
>>  
>>        case vec_to_scalar:
>> -	return aarch64_tune_params.vec_costs->vec_to_scalar_cost;
>> +	return costs->vec_to_scalar_cost;
>>  
>>        case scalar_to_vec:
>> -	return aarch64_tune_params.vec_costs->scalar_to_vec_cost;
>> +	return costs->scalar_to_vec_cost;
>>  
>>        case unaligned_load:
>> -	return aarch64_tune_params.vec_costs->vec_unalign_load_cost;
>> +	return costs->vec_unalign_load_cost;
>>  
>>        case unaligned_store:
>> -	return aarch64_tune_params.vec_costs->vec_unalign_store_cost;
>> +	return costs->vec_unalign_store_cost;
>>  
>>        case cond_branch_taken:
>> -	return aarch64_tune_params.vec_costs->cond_taken_branch_cost;
>> +	return costs->cond_taken_branch_cost;
>>  
>>        case cond_branch_not_taken:
>> -	return aarch64_tune_params.vec_costs->cond_not_taken_branch_cost;
>> +	return costs->cond_not_taken_branch_cost;
>>  
>>        case vec_perm:
>> -	return aarch64_tune_params.vec_costs->vec_permute_cost;
>> +	return costs->vec_permute_cost;
>>  
>>        case vec_promote_demote:
>> -	return aarch64_tune_params.vec_costs->vec_stmt_cost;
>> +	return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
>>  
>>        case vec_construct:
>>          elements = TYPE_VECTOR_SUBPARTS (vectype);
>> Index: tree-vect-loop.c
>> ===================================================================
>> --- tree-vect-loop.c	(revision 245070)
>> +++ tree-vect-loop.c	(working copy)
>> @@ -1329,9 +1329,9 @@ vect_compute_single_scalar_iteration_cos
>>              continue;
>>  
>>  	  vect_cost_for_stmt kind;
>> -          if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
>> +          if (STMT_VINFO_DATA_REF (stmt_info))
>>              {
>> -              if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
>> +              if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
>>                 kind = scalar_load;
>>               else
>>                 kind = scalar_store;
>> @@ -1341,7 +1341,7 @@ vect_compute_single_scalar_iteration_cos
>>  
>>  	  scalar_single_iter_cost
>>  	    += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
>> -				 factor, kind, NULL, 0, vect_prologue);
>> +				 factor, kind, stmt_info, 0, vect_prologue);
>>          }
>>      }
>>    LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
>> @@ -3178,16 +3178,24 @@ vect_get_known_peeling_cost (loop_vec_in
>>    int j;
>>    if (peel_iters_prologue)
>>      FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
>> -      retval += record_stmt_cost (prologue_cost_vec,
>> -				  si->count * peel_iters_prologue,
>> -				  si->kind, NULL, si->misalign,
>> -				  vect_prologue);
>> +	{
>> +	  stmt_vec_info stmt_info
>> +	    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
>> +	  retval += record_stmt_cost (prologue_cost_vec,
>> +				      si->count * peel_iters_prologue,
>> +				      si->kind, stmt_info, si->misalign,
>> +				      vect_prologue);
>> +	}
>>    if (*peel_iters_epilogue)
>>      FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
>> -      retval += record_stmt_cost (epilogue_cost_vec,
>> -				  si->count * *peel_iters_epilogue,
>> -				  si->kind, NULL, si->misalign,
>> -				  vect_epilogue);
>> +	{
>> +	  stmt_vec_info stmt_info
>> +	    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
>> +	  retval += record_stmt_cost (epilogue_cost_vec,
>> +				      si->count * *peel_iters_epilogue,
>> +				      si->kind, stmt_info, si->misalign,
>> +				      vect_epilogue);
>> +	}
>>  
>>    return retval;
>>  }
diff mbox

Patch

Index: config/aarch64/aarch64-protos.h
===================================================================
--- config/aarch64/aarch64-protos.h	(revision 245070)
+++ config/aarch64/aarch64-protos.h	(working copy)
@@ -151,11 +151,17 @@  struct cpu_regmove_cost
 /* Cost for vector insn classes.  */
 struct cpu_vector_cost
 {
-  const int scalar_stmt_cost;		 /* Cost of any scalar operation,
+  const int scalar_int_stmt_cost;	 /* Cost of any int scalar operation,
+					    excluding load and store.  */
+  const int scalar_fp_stmt_cost;	 /* Cost of any fp scalar operation,
 					    excluding load and store.  */
   const int scalar_load_cost;		 /* Cost of scalar load.  */
   const int scalar_store_cost;		 /* Cost of scalar store.  */
-  const int vec_stmt_cost;		 /* Cost of any vector operation,
+  const int vec_int_stmt_cost;		 /* Cost of any int vector operation,
+					    excluding load, store, permute,
+					    vector-to-scalar and
+					    scalar-to-vector operation.  */
+  const int vec_fp_stmt_cost;		 /* Cost of any fp vector operation,
 					    excluding load, store, permute,
 					    vector-to-scalar and
 					    scalar-to-vector operation.  */
Index: config/aarch64/aarch64.c
===================================================================
--- config/aarch64/aarch64.c	(revision 245070)
+++ config/aarch64/aarch64.c	(working copy)
@@ -365,10 +365,12 @@  static const struct cpu_regmove_cost thu
 /* Generic costs for vector insn classes.  */
 static const struct cpu_vector_cost generic_vector_cost =
 {
-  1, /* scalar_stmt_cost  */
+  1, /* scalar_int_stmt_cost  */
+  1, /* scalar_fp_stmt_cost  */
   1, /* scalar_load_cost  */
   1, /* scalar_store_cost  */
-  1, /* vec_stmt_cost  */
+  1, /* vec_int_stmt_cost  */
+  1, /* vec_fp_stmt_cost  */
   2, /* vec_permute_cost  */
   1, /* vec_to_scalar_cost  */
   1, /* scalar_to_vec_cost  */
@@ -383,10 +385,12 @@  static const struct cpu_vector_cost gene
 /* ThunderX costs for vector insn classes.  */
 static const struct cpu_vector_cost thunderx_vector_cost =
 {
-  1, /* scalar_stmt_cost  */
+  1, /* scalar_int_stmt_cost  */
+  1, /* scalar_fp_stmt_cost  */
   3, /* scalar_load_cost  */
   1, /* scalar_store_cost  */
-  4, /* vec_stmt_cost  */
+  4, /* vec_int_stmt_cost  */
+  4, /* vec_fp_stmt_cost  */
   4, /* vec_permute_cost  */
   2, /* vec_to_scalar_cost  */
   2, /* scalar_to_vec_cost  */
@@ -401,10 +405,12 @@  static const struct cpu_vector_cost thun
 /* Generic costs for vector insn classes.  */
 static const struct cpu_vector_cost cortexa57_vector_cost =
 {
-  1, /* scalar_stmt_cost  */
+  1, /* scalar_int_stmt_cost  */
+  1, /* scalar_fp_stmt_cost  */
   4, /* scalar_load_cost  */
   1, /* scalar_store_cost  */
-  2, /* vec_stmt_cost  */
+  2, /* vec_int_stmt_cost  */
+  2, /* vec_fp_stmt_cost  */
   3, /* vec_permute_cost  */
   8, /* vec_to_scalar_cost  */
   8, /* scalar_to_vec_cost  */
@@ -418,10 +424,12 @@  static const struct cpu_vector_cost cort
 
 static const struct cpu_vector_cost exynosm1_vector_cost =
 {
-  1, /* scalar_stmt_cost  */
+  1, /* scalar_int_stmt_cost  */
+  1, /* scalar_fp_stmt_cost  */
   5, /* scalar_load_cost  */
   1, /* scalar_store_cost  */
-  3, /* vec_stmt_cost  */
+  3, /* vec_int_stmt_cost  */
+  3, /* vec_fp_stmt_cost  */
   3, /* vec_permute_cost  */
   3, /* vec_to_scalar_cost  */
   3, /* scalar_to_vec_cost  */
@@ -436,10 +444,12 @@  static const struct cpu_vector_cost exyn
 /* Generic costs for vector insn classes.  */
 static const struct cpu_vector_cost xgene1_vector_cost =
 {
-  1, /* scalar_stmt_cost  */
+  1, /* scalar_int_stmt_cost  */
+  1, /* scalar_fp_stmt_cost  */
   5, /* scalar_load_cost  */
   1, /* scalar_store_cost  */
-  2, /* vec_stmt_cost  */
+  2, /* vec_int_stmt_cost  */
+  2, /* vec_fp_stmt_cost  */
   2, /* vec_permute_cost  */
   4, /* vec_to_scalar_cost  */
   4, /* scalar_to_vec_cost  */
@@ -454,10 +464,12 @@  static const struct cpu_vector_cost xgen
 /* Costs for vector insn classes for Vulcan.  */
 static const struct cpu_vector_cost thunderx2t99_vector_cost =
 {
-  6, /* scalar_stmt_cost  */
+  1, /* scalar_int_stmt_cost  */
+  6, /* scalar_fp_stmt_cost  */
   4, /* scalar_load_cost  */
   1, /* scalar_store_cost  */
-  6, /* vec_stmt_cost  */
+  5, /* vec_int_stmt_cost  */
+  6, /* vec_fp_stmt_cost  */
   3, /* vec_permute_cost  */
   6, /* vec_to_scalar_cost  */
   5, /* scalar_to_vec_cost  */
@@ -8119,50 +8131,55 @@  aarch64_builtin_vectorization_cost (enum
 				    int misalign ATTRIBUTE_UNUSED)
 {
   unsigned elements;
+  const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
+  bool fp = false;
+
+  if (vectype != NULL)
+    fp = FLOAT_TYPE_P (vectype);
 
   switch (type_of_cost)
     {
       case scalar_stmt:
-	return aarch64_tune_params.vec_costs->scalar_stmt_cost;
+	return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
 
       case scalar_load:
-	return aarch64_tune_params.vec_costs->scalar_load_cost;
+	return costs->scalar_load_cost;
 
       case scalar_store:
-	return aarch64_tune_params.vec_costs->scalar_store_cost;
+	return costs->scalar_store_cost;
 
       case vector_stmt:
-	return aarch64_tune_params.vec_costs->vec_stmt_cost;
+	return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
 
       case vector_load:
-	return aarch64_tune_params.vec_costs->vec_align_load_cost;
+	return costs->vec_align_load_cost;
 
       case vector_store:
-	return aarch64_tune_params.vec_costs->vec_store_cost;
+	return costs->vec_store_cost;
 
       case vec_to_scalar:
-	return aarch64_tune_params.vec_costs->vec_to_scalar_cost;
+	return costs->vec_to_scalar_cost;
 
       case scalar_to_vec:
-	return aarch64_tune_params.vec_costs->scalar_to_vec_cost;
+	return costs->scalar_to_vec_cost;
 
       case unaligned_load:
-	return aarch64_tune_params.vec_costs->vec_unalign_load_cost;
+	return costs->vec_unalign_load_cost;
 
       case unaligned_store:
-	return aarch64_tune_params.vec_costs->vec_unalign_store_cost;
+	return costs->vec_unalign_store_cost;
 
       case cond_branch_taken:
-	return aarch64_tune_params.vec_costs->cond_taken_branch_cost;
+	return costs->cond_taken_branch_cost;
 
       case cond_branch_not_taken:
-	return aarch64_tune_params.vec_costs->cond_not_taken_branch_cost;
+	return costs->cond_not_taken_branch_cost;
 
       case vec_perm:
-	return aarch64_tune_params.vec_costs->vec_permute_cost;
+	return costs->vec_permute_cost;
 
       case vec_promote_demote:
-	return aarch64_tune_params.vec_costs->vec_stmt_cost;
+	return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
 
       case vec_construct:
         elements = TYPE_VECTOR_SUBPARTS (vectype);
Index: tree-vect-loop.c
===================================================================
--- tree-vect-loop.c	(revision 245070)
+++ tree-vect-loop.c	(working copy)
@@ -1329,9 +1329,9 @@  vect_compute_single_scalar_iteration_cos
             continue;
 
 	  vect_cost_for_stmt kind;
-          if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
+          if (STMT_VINFO_DATA_REF (stmt_info))
             {
-              if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
+              if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
                kind = scalar_load;
              else
                kind = scalar_store;
@@ -1341,7 +1341,7 @@  vect_compute_single_scalar_iteration_cos
 
 	  scalar_single_iter_cost
 	    += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
-				 factor, kind, NULL, 0, vect_prologue);
+				 factor, kind, stmt_info, 0, vect_prologue);
         }
     }
   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
@@ -3178,16 +3178,24 @@  vect_get_known_peeling_cost (loop_vec_in
   int j;
   if (peel_iters_prologue)
     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
-      retval += record_stmt_cost (prologue_cost_vec,
-				  si->count * peel_iters_prologue,
-				  si->kind, NULL, si->misalign,
-				  vect_prologue);
+	{
+	  stmt_vec_info stmt_info
+	    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
+	  retval += record_stmt_cost (prologue_cost_vec,
+				      si->count * peel_iters_prologue,
+				      si->kind, stmt_info, si->misalign,
+				      vect_prologue);
+	}
   if (*peel_iters_epilogue)
     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
-      retval += record_stmt_cost (epilogue_cost_vec,
-				  si->count * *peel_iters_epilogue,
-				  si->kind, NULL, si->misalign,
-				  vect_epilogue);
+	{
+	  stmt_vec_info stmt_info
+	    = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
+	  retval += record_stmt_cost (epilogue_cost_vec,
+				      si->count * *peel_iters_epilogue,
+				      si->kind, stmt_info, si->misalign,
+				      vect_epilogue);
+	}
 
   return retval;
 }