Patchwork [4.6] Backport fixes for PR50031, PR50969

login
register
mail settings
Submitter William J. Schmidt
Date Feb. 10, 2012, 8:58 p.m.
Message ID <1328907521.18863.6.camel@gnopaine>
Download mbox | patch
Permalink /patch/140720/
State New
Headers show

Comments

William J. Schmidt - Feb. 10, 2012, 8:58 p.m.
This patch backports the two recent trunk fixes for powerpc64
vectorization degradations.  The fixes are largely identical to their
4.7 counterparts except that (a) the logic for
STMT_VINFO_PATTERN_DEF_SEQ does not apply in 4.6, and (b) the changes to
vectorizable_conversion in 4.7 correspond to changes in
vectorizable_type_demotion and vectorizable_type_promotion in 4.6.

Bootstrapped and tested for regressions and performance for
powerpc64-linux.  OK to commit after the trunk patch has a few days of
burn-in?

Thanks,
Bill


2012-02-10  Bill Schmidt <wschmidt@linux.vnet.ibm.com>
	    Ira Rosen <irar@il.ibm.com>

	PR tree-optimization/50031
	PR tree-optimization/50969
	* targhooks.c (default_builtin_vectorization_cost): Handle
	vec_promote_demote.
	* target.h (enum vect_cost_for_stmt): Add vec_promote_demote.
	* tree-vect-loop.c (vect_get_single_scalar_iteraion_cost): Handle
	all types of reduction and pattern statements.
	(vect_estimate_min_profitable_iters): Likewise.
	* tree-vect-stmts.c (vect_model_promotion_demotion_cost): New function.
	(vect_model_store_cost): Use vec_perm rather than vector_stmt for
	statement cost.
	(vect_model_load_cost): Likewise.
	(vect_get_load_cost): Likewise; add dump logic for explicit realigns.
	(vectorizable_type_demotion): Call vect_model_promotion_demotion_cost.
	(vectorizable_type_promotion): Likewise.
	* config/spu/spu.c (spu_builtin_vectorization_cost): Handle
	vec_promote_demote.
	* config/i386/i386.c (ix86_builtin_vectorization_cost): Likewise.
	* config/rs6000/rs6000.c (rs6000_builtin_vectorization_cost): Update
	vec_perm for VSX and handle vec_promote_demote.
William J. Schmidt - Feb. 16, 2012, 1:17 p.m.
Greetings,

Given the recent discussion on getting 4.6 cleaned up, I thought I'd
check back on this one.  Thanks!

Bill

On Fri, 2012-02-10 at 14:58 -0600, William J. Schmidt wrote:
> This patch backports the two recent trunk fixes for powerpc64
> vectorization degradations.  The fixes are largely identical to their
> 4.7 counterparts except that (a) the logic for
> STMT_VINFO_PATTERN_DEF_SEQ does not apply in 4.6, and (b) the changes to
> vectorizable_conversion in 4.7 correspond to changes in
> vectorizable_type_demotion and vectorizable_type_promotion in 4.6.
> 
> Bootstrapped and tested for regressions and performance for
> powerpc64-linux.  OK to commit after the trunk patch has a few days of
> burn-in?
> 
> Thanks,
> Bill
> 
> 
> 2012-02-10  Bill Schmidt <wschmidt@linux.vnet.ibm.com>
> 	    Ira Rosen <irar@il.ibm.com>
> 
> 	PR tree-optimization/50031
> 	PR tree-optimization/50969
> 	* targhooks.c (default_builtin_vectorization_cost): Handle
> 	vec_promote_demote.
> 	* target.h (enum vect_cost_for_stmt): Add vec_promote_demote.
> 	* tree-vect-loop.c (vect_get_single_scalar_iteraion_cost): Handle
> 	all types of reduction and pattern statements.
> 	(vect_estimate_min_profitable_iters): Likewise.
> 	* tree-vect-stmts.c (vect_model_promotion_demotion_cost): New function.
> 	(vect_model_store_cost): Use vec_perm rather than vector_stmt for
> 	statement cost.
> 	(vect_model_load_cost): Likewise.
> 	(vect_get_load_cost): Likewise; add dump logic for explicit realigns.
> 	(vectorizable_type_demotion): Call vect_model_promotion_demotion_cost.
> 	(vectorizable_type_promotion): Likewise.
> 	* config/spu/spu.c (spu_builtin_vectorization_cost): Handle
> 	vec_promote_demote.
> 	* config/i386/i386.c (ix86_builtin_vectorization_cost): Likewise.
> 	* config/rs6000/rs6000.c (rs6000_builtin_vectorization_cost): Update
> 	vec_perm for VSX and handle vec_promote_demote.
> 
> 
> Index: gcc/targhooks.c
> ===================================================================
> --- gcc/targhooks.c	(revision 184047)
> +++ gcc/targhooks.c	(working copy)
> @@ -529,6 +529,7 @@ default_builtin_vectorization_cost (enum vect_cost
>        case scalar_to_vec:
>        case cond_branch_not_taken:
>        case vec_perm:
> +      case vec_promote_demote:
>          return 1;
>  
>        case unaligned_load:
> Index: gcc/target.h
> ===================================================================
> --- gcc/target.h	(revision 184047)
> +++ gcc/target.h	(working copy)
> @@ -128,7 +128,8 @@ enum vect_cost_for_stmt
>    scalar_to_vec,
>    cond_branch_not_taken,
>    cond_branch_taken,
> -  vec_perm
> +  vec_perm,
> +  vec_promote_demote
>  };
>  
>  /* Sets of optimization levels at which an option may be enabled by
> Index: gcc/tree-vect-loop.c
> ===================================================================
> --- gcc/tree-vect-loop.c	(revision 184047)
> +++ gcc/tree-vect-loop.c	(working copy)
> @@ -2104,7 +2104,8 @@ vect_get_single_scalar_iteraion_cost (loop_vec_inf
>            if (stmt_info
>                && !STMT_VINFO_RELEVANT_P (stmt_info)
>                && (!STMT_VINFO_LIVE_P (stmt_info)
> -                  || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
> +                  || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
> +	      && !STMT_VINFO_IN_PATTERN_P (stmt_info))
>              continue;
>  
>            if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
> @@ -2251,11 +2252,19 @@ vect_estimate_min_profitable_iters (loop_vec_info
>  	{
>  	  gimple stmt = gsi_stmt (si);
>  	  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
> +
> +	  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
> +	    {
> +	      stmt = STMT_VINFO_RELATED_STMT (stmt_info);
> +	      stmt_info = vinfo_for_stmt (stmt);
> +	    }
> +
>  	  /* Skip stmts that are not vectorized inside the loop.  */
>  	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
>  	      && (!STMT_VINFO_LIVE_P (stmt_info)
> -		  || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
> +		  || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))))
>  	    continue;
> +
>  	  vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
>  	  /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
>  	     some of the "outside" costs are generated inside the outer-loop.  */
> Index: gcc/tree-vect-stmts.c
> ===================================================================
> --- gcc/tree-vect-stmts.c	(revision 184047)
> +++ gcc/tree-vect-stmts.c	(working copy)
> @@ -623,6 +623,46 @@ vect_model_simple_cost (stmt_vec_info stmt_info, i
>  }
>  
> 
> +/* Model cost for type demotion and promotion operations.  PWR is normally
> +   zero for single-step promotions and demotions.  It will be one if 
> +   two-step promotion/demotion is required, and so on.  Each additional
> +   step doubles the number of instructions required.  */
> +
> +static void
> +vect_model_promotion_demotion_cost (stmt_vec_info stmt_info,
> +				    enum vect_def_type *dt, int pwr)
> +{
> +  int i, tmp;
> +  int inside_cost = 0, outside_cost = 0, single_stmt_cost;
> +
> +  /* The SLP costs were already calculated during SLP tree build.  */
> +  if (PURE_SLP_STMT (stmt_info))
> +    return;
> +
> +  single_stmt_cost = vect_get_stmt_cost (vec_promote_demote);
> +  for (i = 0; i < pwr + 1; i++)
> +    {
> +      tmp = (STMT_VINFO_TYPE (stmt_info) == type_promotion_vec_info_type) ?
> +	(i + 1) : i;
> +      inside_cost += vect_pow2 (tmp) * single_stmt_cost;
> +    }
> +
> +  /* FORNOW: Assuming maximum 2 args per stmts.  */
> +  for (i = 0; i < 2; i++)
> +    {
> +      if (dt[i] == vect_constant_def || dt[i] == vect_external_def)
> +        outside_cost += vect_get_stmt_cost (vector_stmt);
> +    }
> +
> +  if (vect_print_dump_info (REPORT_COST))
> +    fprintf (vect_dump, "vect_model_promotion_demotion_cost: inside_cost = %d, "
> +             "outside_cost = %d .", inside_cost, outside_cost);
> +
> +  /* Set the costs in STMT_INFO.  */
> +  stmt_vinfo_set_inside_of_loop_cost (stmt_info, NULL, inside_cost);
> +  stmt_vinfo_set_outside_of_loop_cost (stmt_info, NULL, outside_cost);
> +}
> +
>  /* Function vect_cost_strided_group_size
>  
>     For strided load or store, return the group_size only if it is the first
> @@ -691,7 +731,7 @@ vect_model_store_cost (stmt_vec_info stmt_info, in
>      {
>        /* Uses a high and low interleave operation for each needed permute.  */
>        inside_cost = ncopies * exact_log2(group_size) * group_size
> -        * vect_get_stmt_cost (vector_stmt);
> +        * vect_get_stmt_cost (vec_perm);
>  
>        if (vect_print_dump_info (REPORT_COST))
>          fprintf (vect_dump, "vect_model_store_cost: strided group_size = %d .",
> @@ -795,7 +835,7 @@ vect_model_load_cost (stmt_vec_info stmt_info, int
>      {
>        /* Uses an even and odd extract operations for each needed permute.  */
>        inside_cost = ncopies * exact_log2(group_size) * group_size
> -	* vect_get_stmt_cost (vector_stmt);
> +	* vect_get_stmt_cost (vec_perm);
>  
>        if (vect_print_dump_info (REPORT_COST))
>          fprintf (vect_dump, "vect_model_load_cost: strided group_size = %d .",
> @@ -855,7 +895,7 @@ vect_get_load_cost (struct data_reference *dr, int
>      case dr_explicit_realign:
>        {
>          *inside_cost += ncopies * (2 * vect_get_stmt_cost (vector_load)
> -           + vect_get_stmt_cost (vector_stmt));
> +				   + vect_get_stmt_cost (vec_perm));
>  
>          /* FIXME: If the misalignment remains fixed across the iterations of
>             the containing loop, the following cost should be added to the
> @@ -863,6 +903,9 @@ vect_get_load_cost (struct data_reference *dr, int
>          if (targetm.vectorize.builtin_mask_for_load)
>            *inside_cost += vect_get_stmt_cost (vector_stmt);
>  
> +        if (vect_print_dump_info (REPORT_COST))
> +          fprintf (vect_dump, "vect_model_load_cost: explicit realign");
> +
>          break;
>        }
>      case dr_explicit_realign_optimized:
> @@ -886,7 +929,12 @@ vect_get_load_cost (struct data_reference *dr, int
>            }
>  
>          *inside_cost += ncopies * (vect_get_stmt_cost (vector_load)
> -          + vect_get_stmt_cost (vector_stmt));
> +				   + vect_get_stmt_cost (vec_perm));
> +
> +        if (vect_print_dump_info (REPORT_COST))
> +          fprintf (vect_dump,
> +		   "vect_model_load_cost: explicit realign optimized");
> +
>          break;
>        }
>  
> @@ -2919,7 +2967,7 @@ vectorizable_type_demotion (gimple stmt, gimple_st
>        STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
>        if (vect_print_dump_info (REPORT_DETAILS))
>          fprintf (vect_dump, "=== vectorizable_demotion ===");
> -      vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
> +      vect_model_promotion_demotion_cost (stmt_info, dt, multi_step_cvt);
>        return true;
>      }
>  
> @@ -3217,7 +3265,7 @@ vectorizable_type_promotion (gimple stmt, gimple_s
>        STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
>        if (vect_print_dump_info (REPORT_DETAILS))
>          fprintf (vect_dump, "=== vectorizable_promotion ===");
> -      vect_model_simple_cost (stmt_info, 2*ncopies, dt, NULL);
> +      vect_model_promotion_demotion_cost (stmt_info, dt, multi_step_cvt);
>        return true;
>      }
>  
> Index: gcc/config/spu/spu.c
> ===================================================================
> --- gcc/config/spu/spu.c	(revision 184047)
> +++ gcc/config/spu/spu.c	(working copy)
> @@ -6794,6 +6794,7 @@ spu_builtin_vectorization_cost (enum vect_cost_for
>        case scalar_to_vec:
>        case cond_branch_not_taken:
>        case vec_perm:
> +      case vec_promote_demote:
>          return 1;
>  
>        case scalar_store:
> Index: gcc/config/i386/i386.c
> ===================================================================
> --- gcc/config/i386/i386.c	(revision 184047)
> +++ gcc/config/i386/i386.c	(working copy)
> @@ -32816,7 +32816,8 @@ ix86_builtin_vectorization_cost (enum vect_cost_fo
>          return ix86_cost->cond_not_taken_branch_cost;
>  
>        case vec_perm:
> -        return 1;
> +      case vec_promote_demote:
> +        return ix86_cost->vec_stmt_cost;
>  
>        default:
>          gcc_unreachable ();
> Index: gcc/config/rs6000/rs6000.c
> ===================================================================
> --- gcc/config/rs6000/rs6000.c	(revision 184047)
> +++ gcc/config/rs6000/rs6000.c	(working copy)
> @@ -3695,12 +3695,23 @@ rs6000_builtin_vectorization_cost (enum vect_cost_
>        case vec_to_scalar:
>        case scalar_to_vec:
>        case cond_branch_not_taken:
> -      case vec_perm:
>          return 1;
>  
>        case cond_branch_taken:
>          return 3;
>  
> +      case vec_perm:
> +	if (TARGET_VSX)
> +	  return 4;
> +	else
> +	  return 1;
> +
> +      case vec_promote_demote:
> +	if (TARGET_VSX)
> +	  return 5;
> +	else
> +	  return 1;
> +
>        case unaligned_load:
>          if (TARGET_VSX && TARGET_ALLOW_MOVMISALIGN)
>            {
>
Richard Guenther - March 2, 2012, 1:48 p.m.
On Thu, Feb 16, 2012 at 2:17 PM, William J. Schmidt
<wschmidt@linux.vnet.ibm.com> wrote:
> Greetings,
>
> Given the recent discussion on getting 4.6 cleaned up, I thought I'd
> check back on this one.  Thanks!

Ok.

Thanks,
Richard.

> Bill
>
> On Fri, 2012-02-10 at 14:58 -0600, William J. Schmidt wrote:
>> This patch backports the two recent trunk fixes for powerpc64
>> vectorization degradations.  The fixes are largely identical to their
>> 4.7 counterparts except that (a) the logic for
>> STMT_VINFO_PATTERN_DEF_SEQ does not apply in 4.6, and (b) the changes to
>> vectorizable_conversion in 4.7 correspond to changes in
>> vectorizable_type_demotion and vectorizable_type_promotion in 4.6.
>>
>> Bootstrapped and tested for regressions and performance for
>> powerpc64-linux.  OK to commit after the trunk patch has a few days of
>> burn-in?
>>
>> Thanks,
>> Bill
>>
>>
>> 2012-02-10  Bill Schmidt <wschmidt@linux.vnet.ibm.com>
>>           Ira Rosen <irar@il.ibm.com>
>>
>>       PR tree-optimization/50031
>>       PR tree-optimization/50969
>>       * targhooks.c (default_builtin_vectorization_cost): Handle
>>       vec_promote_demote.
>>       * target.h (enum vect_cost_for_stmt): Add vec_promote_demote.
>>       * tree-vect-loop.c (vect_get_single_scalar_iteraion_cost): Handle
>>       all types of reduction and pattern statements.
>>       (vect_estimate_min_profitable_iters): Likewise.
>>       * tree-vect-stmts.c (vect_model_promotion_demotion_cost): New function.
>>       (vect_model_store_cost): Use vec_perm rather than vector_stmt for
>>       statement cost.
>>       (vect_model_load_cost): Likewise.
>>       (vect_get_load_cost): Likewise; add dump logic for explicit realigns.
>>       (vectorizable_type_demotion): Call vect_model_promotion_demotion_cost.
>>       (vectorizable_type_promotion): Likewise.
>>       * config/spu/spu.c (spu_builtin_vectorization_cost): Handle
>>       vec_promote_demote.
>>       * config/i386/i386.c (ix86_builtin_vectorization_cost): Likewise.
>>       * config/rs6000/rs6000.c (rs6000_builtin_vectorization_cost): Update
>>       vec_perm for VSX and handle vec_promote_demote.
>>
>>
>> Index: gcc/targhooks.c
>> ===================================================================
>> --- gcc/targhooks.c   (revision 184047)
>> +++ gcc/targhooks.c   (working copy)
>> @@ -529,6 +529,7 @@ default_builtin_vectorization_cost (enum vect_cost
>>        case scalar_to_vec:
>>        case cond_branch_not_taken:
>>        case vec_perm:
>> +      case vec_promote_demote:
>>          return 1;
>>
>>        case unaligned_load:
>> Index: gcc/target.h
>> ===================================================================
>> --- gcc/target.h      (revision 184047)
>> +++ gcc/target.h      (working copy)
>> @@ -128,7 +128,8 @@ enum vect_cost_for_stmt
>>    scalar_to_vec,
>>    cond_branch_not_taken,
>>    cond_branch_taken,
>> -  vec_perm
>> +  vec_perm,
>> +  vec_promote_demote
>>  };
>>
>>  /* Sets of optimization levels at which an option may be enabled by
>> Index: gcc/tree-vect-loop.c
>> ===================================================================
>> --- gcc/tree-vect-loop.c      (revision 184047)
>> +++ gcc/tree-vect-loop.c      (working copy)
>> @@ -2104,7 +2104,8 @@ vect_get_single_scalar_iteraion_cost (loop_vec_inf
>>            if (stmt_info
>>                && !STMT_VINFO_RELEVANT_P (stmt_info)
>>                && (!STMT_VINFO_LIVE_P (stmt_info)
>> -                  || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
>> +                  || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
>> +           && !STMT_VINFO_IN_PATTERN_P (stmt_info))
>>              continue;
>>
>>            if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
>> @@ -2251,11 +2252,19 @@ vect_estimate_min_profitable_iters (loop_vec_info
>>       {
>>         gimple stmt = gsi_stmt (si);
>>         stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
>> +
>> +       if (STMT_VINFO_IN_PATTERN_P (stmt_info))
>> +         {
>> +           stmt = STMT_VINFO_RELATED_STMT (stmt_info);
>> +           stmt_info = vinfo_for_stmt (stmt);
>> +         }
>> +
>>         /* Skip stmts that are not vectorized inside the loop.  */
>>         if (!STMT_VINFO_RELEVANT_P (stmt_info)
>>             && (!STMT_VINFO_LIVE_P (stmt_info)
>> -               || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
>> +               || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))))
>>           continue;
>> +
>>         vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
>>         /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
>>            some of the "outside" costs are generated inside the outer-loop.  */
>> Index: gcc/tree-vect-stmts.c
>> ===================================================================
>> --- gcc/tree-vect-stmts.c     (revision 184047)
>> +++ gcc/tree-vect-stmts.c     (working copy)
>> @@ -623,6 +623,46 @@ vect_model_simple_cost (stmt_vec_info stmt_info, i
>>  }
>>
>>
>> +/* Model cost for type demotion and promotion operations.  PWR is normally
>> +   zero for single-step promotions and demotions.  It will be one if
>> +   two-step promotion/demotion is required, and so on.  Each additional
>> +   step doubles the number of instructions required.  */
>> +
>> +static void
>> +vect_model_promotion_demotion_cost (stmt_vec_info stmt_info,
>> +                                 enum vect_def_type *dt, int pwr)
>> +{
>> +  int i, tmp;
>> +  int inside_cost = 0, outside_cost = 0, single_stmt_cost;
>> +
>> +  /* The SLP costs were already calculated during SLP tree build.  */
>> +  if (PURE_SLP_STMT (stmt_info))
>> +    return;
>> +
>> +  single_stmt_cost = vect_get_stmt_cost (vec_promote_demote);
>> +  for (i = 0; i < pwr + 1; i++)
>> +    {
>> +      tmp = (STMT_VINFO_TYPE (stmt_info) == type_promotion_vec_info_type) ?
>> +     (i + 1) : i;
>> +      inside_cost += vect_pow2 (tmp) * single_stmt_cost;
>> +    }
>> +
>> +  /* FORNOW: Assuming maximum 2 args per stmts.  */
>> +  for (i = 0; i < 2; i++)
>> +    {
>> +      if (dt[i] == vect_constant_def || dt[i] == vect_external_def)
>> +        outside_cost += vect_get_stmt_cost (vector_stmt);
>> +    }
>> +
>> +  if (vect_print_dump_info (REPORT_COST))
>> +    fprintf (vect_dump, "vect_model_promotion_demotion_cost: inside_cost = %d, "
>> +             "outside_cost = %d .", inside_cost, outside_cost);
>> +
>> +  /* Set the costs in STMT_INFO.  */
>> +  stmt_vinfo_set_inside_of_loop_cost (stmt_info, NULL, inside_cost);
>> +  stmt_vinfo_set_outside_of_loop_cost (stmt_info, NULL, outside_cost);
>> +}
>> +
>>  /* Function vect_cost_strided_group_size
>>
>>     For strided load or store, return the group_size only if it is the first
>> @@ -691,7 +731,7 @@ vect_model_store_cost (stmt_vec_info stmt_info, in
>>      {
>>        /* Uses a high and low interleave operation for each needed permute.  */
>>        inside_cost = ncopies * exact_log2(group_size) * group_size
>> -        * vect_get_stmt_cost (vector_stmt);
>> +        * vect_get_stmt_cost (vec_perm);
>>
>>        if (vect_print_dump_info (REPORT_COST))
>>          fprintf (vect_dump, "vect_model_store_cost: strided group_size = %d .",
>> @@ -795,7 +835,7 @@ vect_model_load_cost (stmt_vec_info stmt_info, int
>>      {
>>        /* Uses an even and odd extract operations for each needed permute.  */
>>        inside_cost = ncopies * exact_log2(group_size) * group_size
>> -     * vect_get_stmt_cost (vector_stmt);
>> +     * vect_get_stmt_cost (vec_perm);
>>
>>        if (vect_print_dump_info (REPORT_COST))
>>          fprintf (vect_dump, "vect_model_load_cost: strided group_size = %d .",
>> @@ -855,7 +895,7 @@ vect_get_load_cost (struct data_reference *dr, int
>>      case dr_explicit_realign:
>>        {
>>          *inside_cost += ncopies * (2 * vect_get_stmt_cost (vector_load)
>> -           + vect_get_stmt_cost (vector_stmt));
>> +                                + vect_get_stmt_cost (vec_perm));
>>
>>          /* FIXME: If the misalignment remains fixed across the iterations of
>>             the containing loop, the following cost should be added to the
>> @@ -863,6 +903,9 @@ vect_get_load_cost (struct data_reference *dr, int
>>          if (targetm.vectorize.builtin_mask_for_load)
>>            *inside_cost += vect_get_stmt_cost (vector_stmt);
>>
>> +        if (vect_print_dump_info (REPORT_COST))
>> +          fprintf (vect_dump, "vect_model_load_cost: explicit realign");
>> +
>>          break;
>>        }
>>      case dr_explicit_realign_optimized:
>> @@ -886,7 +929,12 @@ vect_get_load_cost (struct data_reference *dr, int
>>            }
>>
>>          *inside_cost += ncopies * (vect_get_stmt_cost (vector_load)
>> -          + vect_get_stmt_cost (vector_stmt));
>> +                                + vect_get_stmt_cost (vec_perm));
>> +
>> +        if (vect_print_dump_info (REPORT_COST))
>> +          fprintf (vect_dump,
>> +                "vect_model_load_cost: explicit realign optimized");
>> +
>>          break;
>>        }
>>
>> @@ -2919,7 +2967,7 @@ vectorizable_type_demotion (gimple stmt, gimple_st
>>        STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
>>        if (vect_print_dump_info (REPORT_DETAILS))
>>          fprintf (vect_dump, "=== vectorizable_demotion ===");
>> -      vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
>> +      vect_model_promotion_demotion_cost (stmt_info, dt, multi_step_cvt);
>>        return true;
>>      }
>>
>> @@ -3217,7 +3265,7 @@ vectorizable_type_promotion (gimple stmt, gimple_s
>>        STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
>>        if (vect_print_dump_info (REPORT_DETAILS))
>>          fprintf (vect_dump, "=== vectorizable_promotion ===");
>> -      vect_model_simple_cost (stmt_info, 2*ncopies, dt, NULL);
>> +      vect_model_promotion_demotion_cost (stmt_info, dt, multi_step_cvt);
>>        return true;
>>      }
>>
>> Index: gcc/config/spu/spu.c
>> ===================================================================
>> --- gcc/config/spu/spu.c      (revision 184047)
>> +++ gcc/config/spu/spu.c      (working copy)
>> @@ -6794,6 +6794,7 @@ spu_builtin_vectorization_cost (enum vect_cost_for
>>        case scalar_to_vec:
>>        case cond_branch_not_taken:
>>        case vec_perm:
>> +      case vec_promote_demote:
>>          return 1;
>>
>>        case scalar_store:
>> Index: gcc/config/i386/i386.c
>> ===================================================================
>> --- gcc/config/i386/i386.c    (revision 184047)
>> +++ gcc/config/i386/i386.c    (working copy)
>> @@ -32816,7 +32816,8 @@ ix86_builtin_vectorization_cost (enum vect_cost_fo
>>          return ix86_cost->cond_not_taken_branch_cost;
>>
>>        case vec_perm:
>> -        return 1;
>> +      case vec_promote_demote:
>> +        return ix86_cost->vec_stmt_cost;
>>
>>        default:
>>          gcc_unreachable ();
>> Index: gcc/config/rs6000/rs6000.c
>> ===================================================================
>> --- gcc/config/rs6000/rs6000.c        (revision 184047)
>> +++ gcc/config/rs6000/rs6000.c        (working copy)
>> @@ -3695,12 +3695,23 @@ rs6000_builtin_vectorization_cost (enum vect_cost_
>>        case vec_to_scalar:
>>        case scalar_to_vec:
>>        case cond_branch_not_taken:
>> -      case vec_perm:
>>          return 1;
>>
>>        case cond_branch_taken:
>>          return 3;
>>
>> +      case vec_perm:
>> +     if (TARGET_VSX)
>> +       return 4;
>> +     else
>> +       return 1;
>> +
>> +      case vec_promote_demote:
>> +     if (TARGET_VSX)
>> +       return 5;
>> +     else
>> +       return 1;
>> +
>>        case unaligned_load:
>>          if (TARGET_VSX && TARGET_ALLOW_MOVMISALIGN)
>>            {
>>
>

Patch

Index: gcc/targhooks.c
===================================================================
--- gcc/targhooks.c	(revision 184047)
+++ gcc/targhooks.c	(working copy)
@@ -529,6 +529,7 @@  default_builtin_vectorization_cost (enum vect_cost
       case scalar_to_vec:
       case cond_branch_not_taken:
       case vec_perm:
+      case vec_promote_demote:
         return 1;
 
       case unaligned_load:
Index: gcc/target.h
===================================================================
--- gcc/target.h	(revision 184047)
+++ gcc/target.h	(working copy)
@@ -128,7 +128,8 @@  enum vect_cost_for_stmt
   scalar_to_vec,
   cond_branch_not_taken,
   cond_branch_taken,
-  vec_perm
+  vec_perm,
+  vec_promote_demote
 };
 
 /* Sets of optimization levels at which an option may be enabled by
Index: gcc/tree-vect-loop.c
===================================================================
--- gcc/tree-vect-loop.c	(revision 184047)
+++ gcc/tree-vect-loop.c	(working copy)
@@ -2104,7 +2104,8 @@  vect_get_single_scalar_iteraion_cost (loop_vec_inf
           if (stmt_info
               && !STMT_VINFO_RELEVANT_P (stmt_info)
               && (!STMT_VINFO_LIVE_P (stmt_info)
-                  || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
+                  || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
+	      && !STMT_VINFO_IN_PATTERN_P (stmt_info))
             continue;
 
           if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
@@ -2251,11 +2252,19 @@  vect_estimate_min_profitable_iters (loop_vec_info
 	{
 	  gimple stmt = gsi_stmt (si);
 	  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+
+	  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
+	    {
+	      stmt = STMT_VINFO_RELATED_STMT (stmt_info);
+	      stmt_info = vinfo_for_stmt (stmt);
+	    }
+
 	  /* Skip stmts that are not vectorized inside the loop.  */
 	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
 	      && (!STMT_VINFO_LIVE_P (stmt_info)
-		  || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
+		  || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))))
 	    continue;
+
 	  vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
 	  /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
 	     some of the "outside" costs are generated inside the outer-loop.  */
Index: gcc/tree-vect-stmts.c
===================================================================
--- gcc/tree-vect-stmts.c	(revision 184047)
+++ gcc/tree-vect-stmts.c	(working copy)
@@ -623,6 +623,46 @@  vect_model_simple_cost (stmt_vec_info stmt_info, i
 }
 
 
+/* Model cost for type demotion and promotion operations.  PWR is normally
+   zero for single-step promotions and demotions.  It will be one if 
+   two-step promotion/demotion is required, and so on.  Each additional
+   step doubles the number of instructions required.  */
+
+static void
+vect_model_promotion_demotion_cost (stmt_vec_info stmt_info,
+				    enum vect_def_type *dt, int pwr)
+{
+  int i, tmp;
+  int inside_cost = 0, outside_cost = 0, single_stmt_cost;
+
+  /* The SLP costs were already calculated during SLP tree build.  */
+  if (PURE_SLP_STMT (stmt_info))
+    return;
+
+  single_stmt_cost = vect_get_stmt_cost (vec_promote_demote);
+  for (i = 0; i < pwr + 1; i++)
+    {
+      tmp = (STMT_VINFO_TYPE (stmt_info) == type_promotion_vec_info_type) ?
+	(i + 1) : i;
+      inside_cost += vect_pow2 (tmp) * single_stmt_cost;
+    }
+
+  /* FORNOW: Assuming maximum 2 args per stmts.  */
+  for (i = 0; i < 2; i++)
+    {
+      if (dt[i] == vect_constant_def || dt[i] == vect_external_def)
+        outside_cost += vect_get_stmt_cost (vector_stmt);
+    }
+
+  if (vect_print_dump_info (REPORT_COST))
+    fprintf (vect_dump, "vect_model_promotion_demotion_cost: inside_cost = %d, "
+             "outside_cost = %d .", inside_cost, outside_cost);
+
+  /* Set the costs in STMT_INFO.  */
+  stmt_vinfo_set_inside_of_loop_cost (stmt_info, NULL, inside_cost);
+  stmt_vinfo_set_outside_of_loop_cost (stmt_info, NULL, outside_cost);
+}
+
 /* Function vect_cost_strided_group_size
 
    For strided load or store, return the group_size only if it is the first
@@ -691,7 +731,7 @@  vect_model_store_cost (stmt_vec_info stmt_info, in
     {
       /* Uses a high and low interleave operation for each needed permute.  */
       inside_cost = ncopies * exact_log2(group_size) * group_size
-        * vect_get_stmt_cost (vector_stmt);
+        * vect_get_stmt_cost (vec_perm);
 
       if (vect_print_dump_info (REPORT_COST))
         fprintf (vect_dump, "vect_model_store_cost: strided group_size = %d .",
@@ -795,7 +835,7 @@  vect_model_load_cost (stmt_vec_info stmt_info, int
     {
       /* Uses an even and odd extract operations for each needed permute.  */
       inside_cost = ncopies * exact_log2(group_size) * group_size
-	* vect_get_stmt_cost (vector_stmt);
+	* vect_get_stmt_cost (vec_perm);
 
       if (vect_print_dump_info (REPORT_COST))
         fprintf (vect_dump, "vect_model_load_cost: strided group_size = %d .",
@@ -855,7 +895,7 @@  vect_get_load_cost (struct data_reference *dr, int
     case dr_explicit_realign:
       {
         *inside_cost += ncopies * (2 * vect_get_stmt_cost (vector_load)
-           + vect_get_stmt_cost (vector_stmt));
+				   + vect_get_stmt_cost (vec_perm));
 
         /* FIXME: If the misalignment remains fixed across the iterations of
            the containing loop, the following cost should be added to the
@@ -863,6 +903,9 @@  vect_get_load_cost (struct data_reference *dr, int
         if (targetm.vectorize.builtin_mask_for_load)
           *inside_cost += vect_get_stmt_cost (vector_stmt);
 
+        if (vect_print_dump_info (REPORT_COST))
+          fprintf (vect_dump, "vect_model_load_cost: explicit realign");
+
         break;
       }
     case dr_explicit_realign_optimized:
@@ -886,7 +929,12 @@  vect_get_load_cost (struct data_reference *dr, int
           }
 
         *inside_cost += ncopies * (vect_get_stmt_cost (vector_load)
-          + vect_get_stmt_cost (vector_stmt));
+				   + vect_get_stmt_cost (vec_perm));
+
+        if (vect_print_dump_info (REPORT_COST))
+          fprintf (vect_dump,
+		   "vect_model_load_cost: explicit realign optimized");
+
         break;
       }
 
@@ -2919,7 +2967,7 @@  vectorizable_type_demotion (gimple stmt, gimple_st
       STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type;
       if (vect_print_dump_info (REPORT_DETAILS))
         fprintf (vect_dump, "=== vectorizable_demotion ===");
-      vect_model_simple_cost (stmt_info, ncopies, dt, NULL);
+      vect_model_promotion_demotion_cost (stmt_info, dt, multi_step_cvt);
       return true;
     }
 
@@ -3217,7 +3265,7 @@  vectorizable_type_promotion (gimple stmt, gimple_s
       STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type;
       if (vect_print_dump_info (REPORT_DETAILS))
         fprintf (vect_dump, "=== vectorizable_promotion ===");
-      vect_model_simple_cost (stmt_info, 2*ncopies, dt, NULL);
+      vect_model_promotion_demotion_cost (stmt_info, dt, multi_step_cvt);
       return true;
     }
 
Index: gcc/config/spu/spu.c
===================================================================
--- gcc/config/spu/spu.c	(revision 184047)
+++ gcc/config/spu/spu.c	(working copy)
@@ -6794,6 +6794,7 @@  spu_builtin_vectorization_cost (enum vect_cost_for
       case scalar_to_vec:
       case cond_branch_not_taken:
       case vec_perm:
+      case vec_promote_demote:
         return 1;
 
       case scalar_store:
Index: gcc/config/i386/i386.c
===================================================================
--- gcc/config/i386/i386.c	(revision 184047)
+++ gcc/config/i386/i386.c	(working copy)
@@ -32816,7 +32816,8 @@  ix86_builtin_vectorization_cost (enum vect_cost_fo
         return ix86_cost->cond_not_taken_branch_cost;
 
       case vec_perm:
-        return 1;
+      case vec_promote_demote:
+        return ix86_cost->vec_stmt_cost;
 
       default:
         gcc_unreachable ();
Index: gcc/config/rs6000/rs6000.c
===================================================================
--- gcc/config/rs6000/rs6000.c	(revision 184047)
+++ gcc/config/rs6000/rs6000.c	(working copy)
@@ -3695,12 +3695,23 @@  rs6000_builtin_vectorization_cost (enum vect_cost_
       case vec_to_scalar:
       case scalar_to_vec:
       case cond_branch_not_taken:
-      case vec_perm:
         return 1;
 
       case cond_branch_taken:
         return 3;
 
+      case vec_perm:
+	if (TARGET_VSX)
+	  return 4;
+	else
+	  return 1;
+
+      case vec_promote_demote:
+	if (TARGET_VSX)
+	  return 5;
+	else
+	  return 1;
+
       case unaligned_load:
         if (TARGET_VSX && TARGET_ALLOW_MOVMISALIGN)
           {