diff mbox

[vec-tails,08/10] Support loop epilogue masking and low trip count loop vectorization

Message ID 20160519194604.GI40563@msticlxl57.ims.intel.com
State New
Headers show

Commit Message

Ilya Enkovich May 19, 2016, 7:46 p.m. UTC
Hi,

This patch enables vectorization of loop epilogues and low trip count
loops using masking.

Thanks,
Ilya
--
gcc/

2016-05-19  Ilya Enkovich  <ilya.enkovich@intel.com>

	* dbgcnt.def (vect_tail_mask): New.
	* tree-vect-loop.c (vect_analyze_loop_2): Support masked loop
	epilogues and low trip count loops.
	(vect_get_known_peeling_cost): Ignore scalat epilogue cost for
	loops we are going to mask.
	(vect_estimate_min_profitable_iters): Support masked loop
	epilogues and low trip count loops.
	* tree-vectorizer.c (vectorize_loops): Add a message for a case
	when loop epilogue can't be vectorized.

Comments

Richard Biener June 15, 2016, noon UTC | #1
On Thu, May 19, 2016 at 9:46 PM, Ilya Enkovich <enkovich.gnu@gmail.com> wrote:
> Hi,
>
> This patch enables vectorization of loop epilogues and low trip count
> loops using masking.

I wonder why we have the epilogue masking restriction with respect to
the original vectorization factor - shouldn't this simply be handled by
vectorizing the epilogue?  First trying the original VF (requires masking
and is equivalent to low-tripcount loop vectorization), then if that is not
profitable iterate to smaller VFs?   [yes, ideally we'd be able to compare
cost for vectorization with different VFs and choose the best VF]

Thanks,
Richard.

> Thanks,
> Ilya
> --
> gcc/
>
> 2016-05-19  Ilya Enkovich  <ilya.enkovich@intel.com>
>
>         * dbgcnt.def (vect_tail_mask): New.
>         * tree-vect-loop.c (vect_analyze_loop_2): Support masked loop
>         epilogues and low trip count loops.
>         (vect_get_known_peeling_cost): Ignore scalat epilogue cost for
>         loops we are going to mask.
>         (vect_estimate_min_profitable_iters): Support masked loop
>         epilogues and low trip count loops.
>         * tree-vectorizer.c (vectorize_loops): Add a message for a case
>         when loop epilogue can't be vectorized.
>
>
> diff --git a/gcc/dbgcnt.def b/gcc/dbgcnt.def
> index 73c2966..5aad1d7 100644
> --- a/gcc/dbgcnt.def
> +++ b/gcc/dbgcnt.def
> @@ -193,4 +193,5 @@ DEBUG_COUNTER (tree_sra)
>  DEBUG_COUNTER (vect_loop)
>  DEBUG_COUNTER (vect_slp)
>  DEBUG_COUNTER (vect_tail_combine)
> +DEBUG_COUNTER (vect_tail_mask)
>  DEBUG_COUNTER (dom_unreachable_edges)
> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> index 1a80c42..7075f29 100644
> --- a/gcc/tree-vect-loop.c
> +++ b/gcc/tree-vect-loop.c
> @@ -2199,7 +2199,7 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
>    int saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
>    HOST_WIDE_INT estimated_niter;
>    unsigned th;
> -  int min_scalar_loop_bound;
> +  int min_scalar_loop_bound = 0;
>
>    /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
>    ok = vect_analyze_slp (loop_vinfo, n_stmts);
> @@ -2224,6 +2224,30 @@ start_over:
>    unsigned vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
>    gcc_assert (vectorization_factor != 0);
>
> +  /* For now we mask loop epilogue using the same VF since it was used
> +     for cost estimations and it should be easier for reduction
> +     optimization.  */
> +  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
> +      && LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo)
> +      && LOOP_VINFO_ORIG_VECT_FACTOR (loop_vinfo) != (int)vectorization_factor)
> +    {
> +      if (dump_enabled_p ())
> +       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                        "not vectorized: VF for loop epilogue doesn't "
> +                        "match original loop VF.\n");
> +      return false;
> +    }
> +
> +  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
> +      && !LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo)
> +      && LOOP_VINFO_ORIG_VECT_FACTOR (loop_vinfo) <= (int)vectorization_factor)
> +    {
> +      if (dump_enabled_p ())
> +       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                        "not vectorized: VF for loop epilogue is too small\n");
> +      return false;
> +    }
> +
>    if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
>      dump_printf_loc (MSG_NOTE, vect_location,
>                      "vectorization_factor = %d, niters = "
> @@ -2237,11 +2261,29 @@ start_over:
>        || (max_niter != -1
>           && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
>      {
> -      if (dump_enabled_p ())
> -       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -                        "not vectorized: iteration count smaller than "
> -                        "vectorization factor.\n");
> -      return false;
> +      /* Allow low trip count for loop epilogue we want to mask.  */
> +      if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
> +         && LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo))
> +       ;
> +      /* Allow low trip count for non-epilogue loops if flag is enabled.  */
> +      else if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
> +              && flag_tree_vectorize_short_loops)
> +       {
> +         if (dump_enabled_p ())
> +           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                            "iteration count is small, masking is "
> +                            "required for chosen vectorization factor.\n");
> +
> +         LOOP_VINFO_NEED_MASKING (loop_vinfo) = true;
> +       }
> +      else
> +       {
> +         if (dump_enabled_p ())
> +           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                            "not vectorized: iteration count smaller than "
> +                            "vectorization factor.\n");
> +         return false;
> +       }
>      }
>
>    /* Analyze the alignment of the data-refs in the loop.
> @@ -2282,6 +2324,16 @@ start_over:
>        return false;
>      }
>
> +  LOOP_VINFO_CAN_BE_MASKED (loop_vinfo) = true;
> +  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
> +      && LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo))
> +    {
> +      if (dump_enabled_p ())
> +       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                        "vectorizing loop epilogue with masking.\n");
> +      LOOP_VINFO_NEED_MASKING (loop_vinfo) = true;
> +    }
> +
>    if (slp)
>      {
>        /* Analyze operations in the SLP instances.  Note this may
> @@ -2305,6 +2357,19 @@ start_over:
>        return false;
>      }
>
> +  if (LOOP_VINFO_NEED_MASKING (loop_vinfo)
> +      && !LOOP_VINFO_CAN_BE_MASKED (loop_vinfo))
> +    {
> +      gcc_assert (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
> +                 || !LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo));
> +
> +      if (dump_enabled_p ())
> +       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                        "not vectorized: loop cannot be masked.\n");
> +
> +      return false;
> +    }
> +
>    /* Analyze cost.  Decide if worth while to vectorize.  */
>    int min_profitable_estimate, min_profitable_iters;
>    int min_profitable_combine_iters;
> @@ -2324,8 +2389,9 @@ start_over:
>        goto again;
>      }
>
> -  min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
> -                           * vectorization_factor) - 1);
> +  if (!LOOP_VINFO_NEED_MASKING (loop_vinfo))
> +    min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
> +                             * vectorization_factor) - 1);
>
>    /* Use the cost model only if it is more conservative than user specified
>       threshold.  */
> @@ -2425,18 +2491,28 @@ start_over:
>    else if (LOOP_VINFO_CAN_BE_MASKED (loop_vinfo)
>            && min_profitable_combine_iters >= 0)
>      {
> -      if (((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
> -           && (LOOP_VINFO_INT_NITERS (loop_vinfo)
> -               >= (unsigned) min_profitable_combine_iters))
> +      if ((LOOP_VINFO_NEED_MASKING (loop_vinfo)
> +          || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
> +              && (LOOP_VINFO_INT_NITERS (loop_vinfo)
> +                  >= (unsigned) min_profitable_combine_iters))
>            || estimated_niter == -1
>            || estimated_niter >= min_profitable_combine_iters)
> -         && dbg_cnt (vect_tail_combine))
> +         && (LOOP_VINFO_NEED_MASKING (loop_vinfo)
> +             || dbg_cnt (vect_tail_combine)))
>         {
>           LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = false;
>           LOOP_VINFO_COMBINE_EPILOGUE (loop_vinfo) = true;
>
> -         dump_printf_loc (MSG_NOTE, vect_location,
> -                          "Decided to combine loop with its epilogue.\n");
> +          if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo) && dump_enabled_p ())
> +           {
> +             if (LOOP_VINFO_NEED_MASKING (loop_vinfo))
> +               dump_printf_loc (MSG_NOTE, vect_location,
> +                                "Decided to vectorize low trip count loop "
> +                                "with masking.\n");
> +             else
> +               dump_printf_loc (MSG_NOTE, vect_location,
> +                                "Decided to combine loop with its epilogue.\n");
> +           }
>
>           /* We need to adjust profitability check if combine
>              epilogue considering additional vector iteration
> @@ -2463,6 +2539,22 @@ start_over:
>         }
>      }
>
> +  /* Check for not profitable low trip count loop vectorization.  */
> +  if (LOOP_VINFO_NEED_MASKING (loop_vinfo)
> +      && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
> +      && !LOOP_VINFO_COMBINE_EPILOGUE (loop_vinfo))
> +    {
> +      if (dump_enabled_p ())
> +       dump_printf_loc (MSG_NOTE, vect_location,
> +                        "not vectorized: low trip count loop "
> +                        "vectorization is not profitable.\n");
> +      return false;
> +    }
> +
> +  if (LOOP_VINFO_MASK_EPILOGUE (loop_vinfo)
> +      && !dbg_cnt (vect_tail_mask))
> +    LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = false;
> +
>    /* Ok to vectorize!  */
>    return true;
>
> @@ -3413,7 +3505,7 @@ vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
>                                   si->count * peel_iters_prologue,
>                                   si->kind, NULL, si->misalign,
>                                   vect_prologue);
> -  if (*peel_iters_epilogue)
> +  if (*peel_iters_epilogue && !LOOP_VINFO_NEED_MASKING (loop_vinfo))
>      FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
>        retval += record_stmt_cost (epilogue_cost_vec,
>                                   si->count * *peel_iters_epilogue,
> @@ -3451,12 +3543,50 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
>    int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
>    void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
>
> +  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
> +    {
> +      /* Currently we don't produce scalar epilogue version in case
> +        its masked version is provided.  It means we don't need to
> +        compute profitability one more time here.  Just make a
> +        masked loop version.  */
> +      if (LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo))
> +       {
> +         gcc_assert (LOOP_VINFO_CAN_BE_MASKED (loop_vinfo));
> +
> +         dump_printf_loc (MSG_NOTE, vect_location,
> +                          "cost model: mask loop epilogue.\n");
> +
> +         *ret_min_profitable_niters = 0;
> +         *ret_min_profitable_estimate = 0;
> +         *ret_min_profitable_combine_niters = 0;
> +         return;
> +       }
> +      else if (flag_vect_epilogue_cost_model == VECT_COST_MODEL_UNLIMITED)
> +       {
> +         dump_printf_loc (MSG_NOTE, vect_location,
> +                          "cost model disabled for epilogue.\n");
> +         *ret_min_profitable_niters = 0;
> +         *ret_min_profitable_estimate = 0;
> +         return;
> +       }
> +    }
>    /* Cost model disabled.  */
> -  if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
> +  else if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
>      {
>        dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
>        *ret_min_profitable_niters = 0;
>        *ret_min_profitable_estimate = 0;
> +      *ret_min_profitable_combine_niters = -1;
> +
> +      if (LOOP_VINFO_NEED_MASKING (loop_vinfo))
> +       *ret_min_profitable_combine_niters = 0;
> +      else if ((flag_tree_vectorize_epilogues & VECT_EPILOGUE_MASK)
> +              && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo))
> +       LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = true;
> +      else if ((flag_tree_vectorize_epilogues & VECT_EPILOGUE_COMBINE)
> +              && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo))
> +       *ret_min_profitable_combine_niters = 0;
> +
>        return;
>      }
>
> @@ -3544,10 +3674,13 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
>                                 si->count * peel_iters_prologue,
>                                 si->kind, stmt_info, si->misalign,
>                                 vect_prologue);
> -         (void) add_stmt_cost (target_cost_data,
> -                               si->count * peel_iters_epilogue,
> -                               si->kind, stmt_info, si->misalign,
> -                               vect_epilogue);
> +         /* We shouldn't add scalar epilogue cost for low trip
> +            count loops which are masked and have no epilogue.  */
> +         if (!LOOP_VINFO_NEED_MASKING (loop_vinfo))
> +           (void) add_stmt_cost (target_cost_data,
> +                                 si->count * peel_iters_epilogue,
> +                                 si->kind, stmt_info, si->misalign,
> +                                 vect_epilogue);
>         }
>      }
>    else
> @@ -3744,8 +3877,9 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
>                "  Calculated minimum iters for profitability: %d\n",
>                min_profitable_iters);
>
> -  min_profitable_iters =
> -       min_profitable_iters < vf ? vf : min_profitable_iters;
> +  /* Adjust to VF for non-masked loops.  */
> +  if (!LOOP_VINFO_NEED_MASKING (loop_vinfo))
> +    min_profitable_iters = MAX (min_profitable_iters, vf);
>
>    /* Because the condition we create is:
>       if (niters <= min_profitable_iters)
> @@ -3787,6 +3921,25 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
>
>    *ret_min_profitable_combine_niters = -1;
>
> +  /* Handle low trip count loops.  */
> +  if (LOOP_VINFO_NEED_MASKING (loop_vinfo))
> +    {
> +      /* Masked iteration should be better than a scalar loop:
> +        MIC + VIC + MOC < SIC * epilogue_niters  */
> +      if ((int)(masking_inside_cost + masking_prologue_cost + vec_inside_cost)
> +         >= (scalar_single_iter_cost * peel_iters_epilogue))
> +       {
> +         if (dump_enabled_p ())
> +           dump_printf_loc (MSG_NOTE, vect_location,
> +                            "Low trip count loop vectorization is not "
> +                            "profitable.\n");
> +         return;
> +       }
> +
> +      *ret_min_profitable_combine_niters = 0;
> +      return;
> +    }
> +
>    /* Don't try to vectorize epilogue of epilogue.  */
>    if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
>      return;
> @@ -3795,7 +3948,9 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
>      {
>        if (flag_vect_epilogue_cost_model == VECT_COST_MODEL_UNLIMITED)
>         {
> -         if (flag_tree_vectorize_epilogues & VECT_EPILOGUE_COMBINE)
> +         if (flag_tree_vectorize_epilogues & VECT_EPILOGUE_MASK)
> +           LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = true;
> +         else if (flag_tree_vectorize_epilogues & VECT_EPILOGUE_COMBINE)
>             *ret_min_profitable_combine_niters = 0;
>           return;
>         }
> @@ -3854,6 +4009,29 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
>                              profitable_iters);
>           *ret_min_profitable_combine_niters = profitable_iters;
>         }
> +
> +      if (!(flag_tree_vectorize_epilogues & VECT_EPILOGUE_MASK))
> +       return;
> +
> +      /* Now compute profitability for loop epilogue masking.
> +        The following condition must hold true:
> +        SIC * epilogue_niters + SOC > VIC + MIC + MPC  */
> +      int min_profitable_masking_niters
> +       = (vec_inside_cost + masking_inside_cost + masking_prologue_cost
> +          - scalar_outside_cost) / scalar_single_iter_cost;
> +      if (min_profitable_masking_niters > peel_iters_epilogue)
> +       {
> +         if (dump_enabled_p ())
> +           dump_printf_loc (MSG_NOTE, vect_location,
> +                            "Loop epilogue masking is not pofitable.\n");
> +       }
> +      else
> +       {
> +         if (dump_enabled_p ())
> +           dump_printf_loc (MSG_NOTE, vect_location,
> +                            "Loop epilogue masking is pofitable.\n");
> +         LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = true;
> +       }
>      }
>  }
>
> diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
> index 5f15246..f70aed6 100644
> --- a/gcc/tree-vectorizer.c
> +++ b/gcc/tree-vectorizer.c
> @@ -539,7 +539,16 @@ vectorize_loops (void)
>         loop->aux = loop_vinfo;
>
>         if (!loop_vinfo || !LOOP_VINFO_VECTORIZABLE_P (loop_vinfo))
> -         continue;
> +         {
> +           if (loop_vinfo
> +               && LOOP_VINFO_EPILOGUE_P (loop_vinfo)
> +               && LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo)
> +               && dump_enabled_p ())
> +             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                              "loop epilogue can't be vectorized.\n");
> +
> +           continue;
> +         }
>
>          if (!dbg_cnt (vect_loop))
>           {
Jeff Law June 16, 2016, 3:45 p.m. UTC | #2
On 05/19/2016 01:46 PM, Ilya Enkovich wrote:
> Hi,
>
> This patch enables vectorization of loop epilogues and low trip count
> loops using masking.
>
> Thanks,
> Ilya
> --
> gcc/
>
> 2016-05-19  Ilya Enkovich  <ilya.enkovich@intel.com>
>
> 	* dbgcnt.def (vect_tail_mask): New.
> 	* tree-vect-loop.c (vect_analyze_loop_2): Support masked loop
> 	epilogues and low trip count loops.
> 	(vect_get_known_peeling_cost): Ignore scalat epilogue cost for
s/scalat/scalar/

> 	loops we are going to mask.
> 	(vect_estimate_min_profitable_iters): Support masked loop
> 	epilogues and low trip count loops.
> 	* tree-vectorizer.c (vectorize_loops): Add a message for a case
> 	when loop epilogue can't be vectorized.
>
I don't see anything here that worries me.  Richi's question is a valid 
one, but I don't have a strong opinion on whether or not that should be 
explored as a prerequisite for this work to be accepted or if it should 
be a follow-up item.  So take guidance from Richi on that.

jeff
Ilya Enkovich June 16, 2016, 3:52 p.m. UTC | #3
2016-06-15 15:00 GMT+03:00 Richard Biener <richard.guenther@gmail.com>:
> On Thu, May 19, 2016 at 9:46 PM, Ilya Enkovich <enkovich.gnu@gmail.com> wrote:
>> Hi,
>>
>> This patch enables vectorization of loop epilogues and low trip count
>> loops using masking.
>
> I wonder why we have the epilogue masking restriction with respect to
> the original vectorization factor - shouldn't this simply be handled by
> vectorizing the epilogue?  First trying the original VF (requires masking
> and is equivalent to low-tripcount loop vectorization), then if that is not
> profitable iterate to smaller VFs?   [yes, ideally we'd be able to compare
> cost for vectorization with different VFs and choose the best VF]

When the main loop is vectorized using some VF, we compute epilogue masking
profitability and generate an epilogue to be vectorized and masked using
exactly the same VF.  In the ideal case we never fail to vectorize the
epilogue because we check that it can be masked.  Unfortunately, we may lose
some info when generating a loop copy (e.g. scev info is lost) and therefore
may fail to vectorize the epilogue.

I expect that if we lose some info and thus fail to vectorize for a specified
VF (for which the main loop was successfully vectorized), then we are going to
fail to vectorize for other vector sizes too.  Actually, I'd prefer to try only
a single vector size for vectorization with masking, to save compilation time.

Thanks,
Ilya

>
> Thanks,
> Richard.
>
>> Thanks,
>> Ilya
>> --
>> gcc/
>>
>> 2016-05-19  Ilya Enkovich  <ilya.enkovich@intel.com>
>>
>>         * dbgcnt.def (vect_tail_mask): New.
>>         * tree-vect-loop.c (vect_analyze_loop_2): Support masked loop
>>         epilogues and low trip count loops.
>>         (vect_get_known_peeling_cost): Ignore scalat epilogue cost for
>>         loops we are going to mask.
>>         (vect_estimate_min_profitable_iters): Support masked loop
>>         epilogues and low trip count loops.
>>         * tree-vectorizer.c (vectorize_loops): Add a message for a case
>>         when loop epilogue can't be vectorized.
>>
>>
>> diff --git a/gcc/dbgcnt.def b/gcc/dbgcnt.def
>> index 73c2966..5aad1d7 100644
>> --- a/gcc/dbgcnt.def
>> +++ b/gcc/dbgcnt.def
>> @@ -193,4 +193,5 @@ DEBUG_COUNTER (tree_sra)
>>  DEBUG_COUNTER (vect_loop)
>>  DEBUG_COUNTER (vect_slp)
>>  DEBUG_COUNTER (vect_tail_combine)
>> +DEBUG_COUNTER (vect_tail_mask)
>>  DEBUG_COUNTER (dom_unreachable_edges)
>> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
>> index 1a80c42..7075f29 100644
>> --- a/gcc/tree-vect-loop.c
>> +++ b/gcc/tree-vect-loop.c
>> @@ -2199,7 +2199,7 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
>>    int saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
>>    HOST_WIDE_INT estimated_niter;
>>    unsigned th;
>> -  int min_scalar_loop_bound;
>> +  int min_scalar_loop_bound = 0;
>>
>>    /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
>>    ok = vect_analyze_slp (loop_vinfo, n_stmts);
>> @@ -2224,6 +2224,30 @@ start_over:
>>    unsigned vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
>>    gcc_assert (vectorization_factor != 0);
>>
>> +  /* For now we mask loop epilogue using the same VF since it was used
>> +     for cost estimations and it should be easier for reduction
>> +     optimization.  */
>> +  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
>> +      && LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo)
>> +      && LOOP_VINFO_ORIG_VECT_FACTOR (loop_vinfo) != (int)vectorization_factor)
>> +    {
>> +      if (dump_enabled_p ())
>> +       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>> +                        "not vectorized: VF for loop epilogue doesn't "
>> +                        "match original loop VF.\n");
>> +      return false;
>> +    }
>> +
>> +  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
>> +      && !LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo)
>> +      && LOOP_VINFO_ORIG_VECT_FACTOR (loop_vinfo) <= (int)vectorization_factor)
>> +    {
>> +      if (dump_enabled_p ())
>> +       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>> +                        "not vectorized: VF for loop epilogue is too small\n");
>> +      return false;
>> +    }
>> +
>>    if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
>>      dump_printf_loc (MSG_NOTE, vect_location,
>>                      "vectorization_factor = %d, niters = "
>> @@ -2237,11 +2261,29 @@ start_over:
>>        || (max_niter != -1
>>           && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
>>      {
>> -      if (dump_enabled_p ())
>> -       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>> -                        "not vectorized: iteration count smaller than "
>> -                        "vectorization factor.\n");
>> -      return false;
>> +      /* Allow low trip count for loop epilogue we want to mask.  */
>> +      if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
>> +         && LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo))
>> +       ;
>> +      /* Allow low trip count for non-epilogue loops if flag is enabled.  */
>> +      else if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
>> +              && flag_tree_vectorize_short_loops)
>> +       {
>> +         if (dump_enabled_p ())
>> +           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>> +                            "iteration count is small, masking is "
>> +                            "required for chosen vectorization factor.\n");
>> +
>> +         LOOP_VINFO_NEED_MASKING (loop_vinfo) = true;
>> +       }
>> +      else
>> +       {
>> +         if (dump_enabled_p ())
>> +           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>> +                            "not vectorized: iteration count smaller than "
>> +                            "vectorization factor.\n");
>> +         return false;
>> +       }
>>      }
>>
>>    /* Analyze the alignment of the data-refs in the loop.
>> @@ -2282,6 +2324,16 @@ start_over:
>>        return false;
>>      }
>>
>> +  LOOP_VINFO_CAN_BE_MASKED (loop_vinfo) = true;
>> +  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
>> +      && LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo))
>> +    {
>> +      if (dump_enabled_p ())
>> +       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>> +                        "vectorizing loop epilogue with masking.\n");
>> +      LOOP_VINFO_NEED_MASKING (loop_vinfo) = true;
>> +    }
>> +
>>    if (slp)
>>      {
>>        /* Analyze operations in the SLP instances.  Note this may
>> @@ -2305,6 +2357,19 @@ start_over:
>>        return false;
>>      }
>>
>> +  if (LOOP_VINFO_NEED_MASKING (loop_vinfo)
>> +      && !LOOP_VINFO_CAN_BE_MASKED (loop_vinfo))
>> +    {
>> +      gcc_assert (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
>> +                 || !LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo));
>> +
>> +      if (dump_enabled_p ())
>> +       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>> +                        "not vectorized: loop cannot be masked.\n");
>> +
>> +      return false;
>> +    }
>> +
>>    /* Analyze cost.  Decide if worth while to vectorize.  */
>>    int min_profitable_estimate, min_profitable_iters;
>>    int min_profitable_combine_iters;
>> @@ -2324,8 +2389,9 @@ start_over:
>>        goto again;
>>      }
>>
>> -  min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
>> -                           * vectorization_factor) - 1);
>> +  if (!LOOP_VINFO_NEED_MASKING (loop_vinfo))
>> +    min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
>> +                             * vectorization_factor) - 1);
>>
>>    /* Use the cost model only if it is more conservative than user specified
>>       threshold.  */
>> @@ -2425,18 +2491,28 @@ start_over:
>>    else if (LOOP_VINFO_CAN_BE_MASKED (loop_vinfo)
>>            && min_profitable_combine_iters >= 0)
>>      {
>> -      if (((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
>> -           && (LOOP_VINFO_INT_NITERS (loop_vinfo)
>> -               >= (unsigned) min_profitable_combine_iters))
>> +      if ((LOOP_VINFO_NEED_MASKING (loop_vinfo)
>> +          || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
>> +              && (LOOP_VINFO_INT_NITERS (loop_vinfo)
>> +                  >= (unsigned) min_profitable_combine_iters))
>>            || estimated_niter == -1
>>            || estimated_niter >= min_profitable_combine_iters)
>> -         && dbg_cnt (vect_tail_combine))
>> +         && (LOOP_VINFO_NEED_MASKING (loop_vinfo)
>> +             || dbg_cnt (vect_tail_combine)))
>>         {
>>           LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = false;
>>           LOOP_VINFO_COMBINE_EPILOGUE (loop_vinfo) = true;
>>
>> -         dump_printf_loc (MSG_NOTE, vect_location,
>> -                          "Decided to combine loop with its epilogue.\n");
>> +          if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo) && dump_enabled_p ())
>> +           {
>> +             if (LOOP_VINFO_NEED_MASKING (loop_vinfo))
>> +               dump_printf_loc (MSG_NOTE, vect_location,
>> +                                "Decided to vectorize low trip count loop "
>> +                                "with masking.\n");
>> +             else
>> +               dump_printf_loc (MSG_NOTE, vect_location,
>> +                                "Decided to combine loop with its epilogue.\n");
>> +           }
>>
>>           /* We need to adjust profitability check if combine
>>              epilogue considering additional vector iteration
>> @@ -2463,6 +2539,22 @@ start_over:
>>         }
>>      }
>>
>> +  /* Check for not profitable low trip count loop vectorization.  */
>> +  if (LOOP_VINFO_NEED_MASKING (loop_vinfo)
>> +      && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
>> +      && !LOOP_VINFO_COMBINE_EPILOGUE (loop_vinfo))
>> +    {
>> +      if (dump_enabled_p ())
>> +       dump_printf_loc (MSG_NOTE, vect_location,
>> +                        "not vectorized: low trip count loop "
>> +                        "vectorization is not profitable.\n");
>> +      return false;
>> +    }
>> +
>> +  if (LOOP_VINFO_MASK_EPILOGUE (loop_vinfo)
>> +      && !dbg_cnt (vect_tail_mask))
>> +    LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = false;
>> +
>>    /* Ok to vectorize!  */
>>    return true;
>>
>> @@ -3413,7 +3505,7 @@ vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
>>                                   si->count * peel_iters_prologue,
>>                                   si->kind, NULL, si->misalign,
>>                                   vect_prologue);
>> -  if (*peel_iters_epilogue)
>> +  if (*peel_iters_epilogue && !LOOP_VINFO_NEED_MASKING (loop_vinfo))
>>      FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
>>        retval += record_stmt_cost (epilogue_cost_vec,
>>                                   si->count * *peel_iters_epilogue,
>> @@ -3451,12 +3543,50 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
>>    int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
>>    void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
>>
>> +  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
>> +    {
>> +      /* Currently we don't produce scalar epilogue version in case
>> +        its masked version is provided.  It means we don't need to
>> +        compute profitability one more time here.  Just make a
>> +        masked loop version.  */
>> +      if (LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo))
>> +       {
>> +         gcc_assert (LOOP_VINFO_CAN_BE_MASKED (loop_vinfo));
>> +
>> +         dump_printf_loc (MSG_NOTE, vect_location,
>> +                          "cost model: mask loop epilogue.\n");
>> +
>> +         *ret_min_profitable_niters = 0;
>> +         *ret_min_profitable_estimate = 0;
>> +         *ret_min_profitable_combine_niters = 0;
>> +         return;
>> +       }
>> +      else if (flag_vect_epilogue_cost_model == VECT_COST_MODEL_UNLIMITED)
>> +       {
>> +         dump_printf_loc (MSG_NOTE, vect_location,
>> +                          "cost model disabled for epilogue.\n");
>> +         *ret_min_profitable_niters = 0;
>> +         *ret_min_profitable_estimate = 0;
>> +         return;
>> +       }
>> +    }
>>    /* Cost model disabled.  */
>> -  if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
>> +  else if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
>>      {
>>        dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
>>        *ret_min_profitable_niters = 0;
>>        *ret_min_profitable_estimate = 0;
>> +      *ret_min_profitable_combine_niters = -1;
>> +
>> +      if (LOOP_VINFO_NEED_MASKING (loop_vinfo))
>> +       *ret_min_profitable_combine_niters = 0;
>> +      else if ((flag_tree_vectorize_epilogues & VECT_EPILOGUE_MASK)
>> +              && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo))
>> +       LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = true;
>> +      else if ((flag_tree_vectorize_epilogues & VECT_EPILOGUE_COMBINE)
>> +              && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo))
>> +       *ret_min_profitable_combine_niters = 0;
>> +
>>        return;
>>      }
>>
>> @@ -3544,10 +3674,13 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
>>                                 si->count * peel_iters_prologue,
>>                                 si->kind, stmt_info, si->misalign,
>>                                 vect_prologue);
>> -         (void) add_stmt_cost (target_cost_data,
>> -                               si->count * peel_iters_epilogue,
>> -                               si->kind, stmt_info, si->misalign,
>> -                               vect_epilogue);
>> +         /* We shouldn't add scalar epilogue cost for low trip
>> +            count loops which are masked and have no epilogue.  */
>> +         if (!LOOP_VINFO_NEED_MASKING (loop_vinfo))
>> +           (void) add_stmt_cost (target_cost_data,
>> +                                 si->count * peel_iters_epilogue,
>> +                                 si->kind, stmt_info, si->misalign,
>> +                                 vect_epilogue);
>>         }
>>      }
>>    else
>> @@ -3744,8 +3877,9 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
>>                "  Calculated minimum iters for profitability: %d\n",
>>                min_profitable_iters);
>>
>> -  min_profitable_iters =
>> -       min_profitable_iters < vf ? vf : min_profitable_iters;
>> +  /* Adjust to VF for non-masked loops.  */
>> +  if (!LOOP_VINFO_NEED_MASKING (loop_vinfo))
>> +    min_profitable_iters = MAX (min_profitable_iters, vf);
>>
>>    /* Because the condition we create is:
>>       if (niters <= min_profitable_iters)
>> @@ -3787,6 +3921,25 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
>>
>>    *ret_min_profitable_combine_niters = -1;
>>
>> +  /* Handle low trip count loops.  */
>> +  if (LOOP_VINFO_NEED_MASKING (loop_vinfo))
>> +    {
>> +      /* Masked iteration should be better than a scalar loop:
>> +        MIC + VIC + MPC < SIC * epilogue_niters  */
>> +      if ((int)(masking_inside_cost + masking_prologue_cost + vec_inside_cost)
>> +         >= (scalar_single_iter_cost * peel_iters_epilogue))
>> +       {
>> +         if (dump_enabled_p ())
>> +           dump_printf_loc (MSG_NOTE, vect_location,
>> +                            "Low trip count loop vectorization is not "
>> +                            "profitable.\n");
>> +         return;
>> +       }
>> +
>> +      *ret_min_profitable_combine_niters = 0;
>> +      return;
>> +    }
>> +
>>    /* Don't try to vectorize epilogue of epilogue.  */
>>    if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
>>      return;
>> @@ -3795,7 +3948,9 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
>>      {
>>        if (flag_vect_epilogue_cost_model == VECT_COST_MODEL_UNLIMITED)
>>         {
>> -         if (flag_tree_vectorize_epilogues & VECT_EPILOGUE_COMBINE)
>> +         if (flag_tree_vectorize_epilogues & VECT_EPILOGUE_MASK)
>> +           LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = true;
>> +         else if (flag_tree_vectorize_epilogues & VECT_EPILOGUE_COMBINE)
>>             *ret_min_profitable_combine_niters = 0;
>>           return;
>>         }
>> @@ -3854,6 +4009,29 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
>>                              profitable_iters);
>>           *ret_min_profitable_combine_niters = profitable_iters;
>>         }
>> +
>> +      if (!(flag_tree_vectorize_epilogues & VECT_EPILOGUE_MASK))
>> +       return;
>> +
>> +      /* Now compute profitability for loop epilogue masking.
>> +        The following condition must hold true:
>> +        SIC * epilogue_niters + SOC > VIC + MIC + MPC  */
>> +      int min_profitable_masking_niters
>> +       = (vec_inside_cost + masking_inside_cost + masking_prologue_cost
>> +          - scalar_outside_cost) / scalar_single_iter_cost;
>> +      if (min_profitable_masking_niters > peel_iters_epilogue)
>> +       {
>> +         if (dump_enabled_p ())
>> +           dump_printf_loc (MSG_NOTE, vect_location,
>> +                            "Loop epilogue masking is not profitable.\n");
>> +       }
>> +      else
>> +       {
>> +         if (dump_enabled_p ())
>> +           dump_printf_loc (MSG_NOTE, vect_location,
>> +                            "Loop epilogue masking is profitable.\n");
>> +         LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = true;
>> +       }
>>      }
>>  }
>>
>> diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
>> index 5f15246..f70aed6 100644
>> --- a/gcc/tree-vectorizer.c
>> +++ b/gcc/tree-vectorizer.c
>> @@ -539,7 +539,16 @@ vectorize_loops (void)
>>         loop->aux = loop_vinfo;
>>
>>         if (!loop_vinfo || !LOOP_VINFO_VECTORIZABLE_P (loop_vinfo))
>> -         continue;
>> +         {
>> +           if (loop_vinfo
>> +               && LOOP_VINFO_EPILOGUE_P (loop_vinfo)
>> +               && LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo)
>> +               && dump_enabled_p ())
>> +             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>> +                              "loop epilogue can't be vectorized.\n");
>> +
>> +           continue;
>> +         }
>>
>>          if (!dbg_cnt (vect_loop))
>>           {
diff mbox

Patch

diff --git a/gcc/dbgcnt.def b/gcc/dbgcnt.def
index 73c2966..5aad1d7 100644
--- a/gcc/dbgcnt.def
+++ b/gcc/dbgcnt.def
@@ -193,4 +193,5 @@  DEBUG_COUNTER (tree_sra)
 DEBUG_COUNTER (vect_loop)
 DEBUG_COUNTER (vect_slp)
 DEBUG_COUNTER (vect_tail_combine)
+DEBUG_COUNTER (vect_tail_mask)
 DEBUG_COUNTER (dom_unreachable_edges)
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 1a80c42..7075f29 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -2199,7 +2199,7 @@  vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
   int saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
   HOST_WIDE_INT estimated_niter;
   unsigned th;
-  int min_scalar_loop_bound;
+  int min_scalar_loop_bound = 0;
 
   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
   ok = vect_analyze_slp (loop_vinfo, n_stmts);
@@ -2224,6 +2224,30 @@  start_over:
   unsigned vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
   gcc_assert (vectorization_factor != 0);
 
+  /* For now we mask loop epilogue using the same VF since it was used
+     for cost estimations and it should be easier for reduction
+     optimization.  */
+  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+      && LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo)
+      && LOOP_VINFO_ORIG_VECT_FACTOR (loop_vinfo) != (int)vectorization_factor)
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			 "not vectorized: VF for loop epilogue doesn't "
+			 "match original loop VF.\n");
+      return false;
+    }
+
+  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+      && !LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo)
+      && LOOP_VINFO_ORIG_VECT_FACTOR (loop_vinfo) <= (int)vectorization_factor)
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			 "not vectorized: VF for loop epilogue is too large\n");
+      return false;
+    }
+
   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
 		     "vectorization_factor = %d, niters = "
@@ -2237,11 +2261,29 @@  start_over:
       || (max_niter != -1
 	  && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
     {
-      if (dump_enabled_p ())
-	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-			 "not vectorized: iteration count smaller than "
-			 "vectorization factor.\n");
-      return false;
+      /* Allow low trip count for loop epilogue we want to mask.  */
+      if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+	  && LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo))
+	;
+      /* Allow low trip count for non-epilogue loops if flag is enabled.  */
+      else if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+	       && flag_tree_vectorize_short_loops)
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "iteration count is small, masking is "
+			     "required for chosen vectorization factor.\n");
+
+	  LOOP_VINFO_NEED_MASKING (loop_vinfo) = true;
+	}
+      else
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "not vectorized: iteration count smaller than "
+			     "vectorization factor.\n");
+	  return false;
+	}
     }
 
   /* Analyze the alignment of the data-refs in the loop.
@@ -2282,6 +2324,16 @@  start_over:
       return false;
     }
 
+  LOOP_VINFO_CAN_BE_MASKED (loop_vinfo) = true;
+  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+      && LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo))
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			 "vectorizing loop epilogue with masking.\n");
+      LOOP_VINFO_NEED_MASKING (loop_vinfo) = true;
+    }
+
   if (slp)
     {
       /* Analyze operations in the SLP instances.  Note this may
@@ -2305,6 +2357,19 @@  start_over:
       return false;
     }
 
+  if (LOOP_VINFO_NEED_MASKING (loop_vinfo)
+      && !LOOP_VINFO_CAN_BE_MASKED (loop_vinfo))
+    {
+      gcc_assert (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+		  || !LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo));
+
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			 "not vectorized: loop cannot be masked.\n");
+
+      return false;
+    }
+
   /* Analyze cost.  Decide if worth while to vectorize.  */
   int min_profitable_estimate, min_profitable_iters;
   int min_profitable_combine_iters;
@@ -2324,8 +2389,9 @@  start_over:
       goto again;
     }
 
-  min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
-			    * vectorization_factor) - 1);
+  if (!LOOP_VINFO_NEED_MASKING (loop_vinfo))
+    min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
+			      * vectorization_factor) - 1);
 
   /* Use the cost model only if it is more conservative than user specified
      threshold.  */
@@ -2425,18 +2491,28 @@  start_over:
   else if (LOOP_VINFO_CAN_BE_MASKED (loop_vinfo)
 	   && min_profitable_combine_iters >= 0)
     {
-      if (((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-	    && (LOOP_VINFO_INT_NITERS (loop_vinfo)
-		>= (unsigned) min_profitable_combine_iters))
+      if ((LOOP_VINFO_NEED_MASKING (loop_vinfo)
+	   || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+	       && (LOOP_VINFO_INT_NITERS (loop_vinfo)
+		   >= (unsigned) min_profitable_combine_iters))
 	   || estimated_niter == -1
 	   || estimated_niter >= min_profitable_combine_iters)
-	  && dbg_cnt (vect_tail_combine))
+	  && (LOOP_VINFO_NEED_MASKING (loop_vinfo)
+	      || dbg_cnt (vect_tail_combine)))
 	{
 	  LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = false;
 	  LOOP_VINFO_COMBINE_EPILOGUE (loop_vinfo) = true;
 
-	  dump_printf_loc (MSG_NOTE, vect_location,
-			   "Decided to combine loop with its epilogue.\n");
+          if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo) && dump_enabled_p ())
+	    {
+	      if (LOOP_VINFO_NEED_MASKING (loop_vinfo))
+		dump_printf_loc (MSG_NOTE, vect_location,
+				 "Decided to vectorize low trip count loop "
+				 "with masking.\n");
+	      else
+		dump_printf_loc (MSG_NOTE, vect_location,
+				 "Decided to combine loop with its epilogue.\n");
+	    }
 
 	  /* We need to adjust profitability check if combine
 	     epilogue considering additional vector iteration
@@ -2463,6 +2539,22 @@  start_over:
 	}
     }
 
+  /* Check for not profitable low trip count loop vectorization.  */
+  if (LOOP_VINFO_NEED_MASKING (loop_vinfo)
+      && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+      && !LOOP_VINFO_COMBINE_EPILOGUE (loop_vinfo))
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_NOTE, vect_location,
+			 "not vectorized: low trip count loop "
+			 "vectorization is not profitable.\n");
+      return false;
+    }
+
+  if (LOOP_VINFO_MASK_EPILOGUE (loop_vinfo)
+      && !dbg_cnt (vect_tail_mask))
+    LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = false;
+
   /* Ok to vectorize!  */
   return true;
 
@@ -3413,7 +3505,7 @@  vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
 				  si->count * peel_iters_prologue,
 				  si->kind, NULL, si->misalign,
 				  vect_prologue);
-  if (*peel_iters_epilogue)
+  if (*peel_iters_epilogue && !LOOP_VINFO_NEED_MASKING (loop_vinfo))
     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
       retval += record_stmt_cost (epilogue_cost_vec,
 				  si->count * *peel_iters_epilogue,
@@ -3451,12 +3543,50 @@  vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
 
+  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
+    {
+      /* Currently we don't produce scalar epilogue version in case
+	 its masked version is provided.  It means we don't need to
+	 compute profitability one more time here.  Just make a
+	 masked loop version.  */
+      if (LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo))
+	{
+	  gcc_assert (LOOP_VINFO_CAN_BE_MASKED (loop_vinfo));
+
+	  dump_printf_loc (MSG_NOTE, vect_location,
+			   "cost model: mask loop epilogue.\n");
+
+	  *ret_min_profitable_niters = 0;
+	  *ret_min_profitable_estimate = 0;
+	  *ret_min_profitable_combine_niters = 0;
+	  return;
+	}
+      else if (flag_vect_epilogue_cost_model == VECT_COST_MODEL_UNLIMITED)
+	{
+	  dump_printf_loc (MSG_NOTE, vect_location,
+			   "cost model disabled for epilogue.\n");
+	  *ret_min_profitable_niters = 0;
+	  *ret_min_profitable_estimate = 0;
+	  return;
+	}
+    }
   /* Cost model disabled.  */
-  if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
+  else if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
     {
       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
       *ret_min_profitable_niters = 0;
       *ret_min_profitable_estimate = 0;
+      *ret_min_profitable_combine_niters = -1;
+
+      if (LOOP_VINFO_NEED_MASKING (loop_vinfo))
+	*ret_min_profitable_combine_niters = 0;
+      else if ((flag_tree_vectorize_epilogues & VECT_EPILOGUE_MASK)
+	       && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo))
+	LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = true;
+      else if ((flag_tree_vectorize_epilogues & VECT_EPILOGUE_COMBINE)
+	       && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo))
+	*ret_min_profitable_combine_niters = 0;
+
       return;
     }
 
@@ -3544,10 +3674,13 @@  vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
 				si->count * peel_iters_prologue,
 				si->kind, stmt_info, si->misalign,
 				vect_prologue);
-	  (void) add_stmt_cost (target_cost_data,
-				si->count * peel_iters_epilogue,
-				si->kind, stmt_info, si->misalign,
-				vect_epilogue);
+	  /* We shouldn't add scalar epilogue cost for low trip
+	     count loops which are masked and have no epilogue.  */
+	  if (!LOOP_VINFO_NEED_MASKING (loop_vinfo))
+	    (void) add_stmt_cost (target_cost_data,
+				  si->count * peel_iters_epilogue,
+				  si->kind, stmt_info, si->misalign,
+				  vect_epilogue);
 	}
     }
   else
@@ -3744,8 +3877,9 @@  vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
 	       "  Calculated minimum iters for profitability: %d\n",
 	       min_profitable_iters);
 
-  min_profitable_iters =
-	min_profitable_iters < vf ? vf : min_profitable_iters;
+  /* Adjust to VF for non-masked loops.  */
+  if (!LOOP_VINFO_NEED_MASKING (loop_vinfo))
+    min_profitable_iters = MAX (min_profitable_iters, vf);
 
   /* Because the condition we create is:
      if (niters <= min_profitable_iters)
@@ -3787,6 +3921,25 @@  vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
 
   *ret_min_profitable_combine_niters = -1;
 
+  /* Handle low trip count loops.  */
+  if (LOOP_VINFO_NEED_MASKING (loop_vinfo))
+    {
+      /* Masked iteration should be better than a scalar loop:
+	 MIC + VIC + MPC < SIC * epilogue_niters  */
+      if ((int)(masking_inside_cost + masking_prologue_cost + vec_inside_cost)
+	  >= (scalar_single_iter_cost * peel_iters_epilogue))
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_NOTE, vect_location,
+			     "Low trip count loop vectorization is not "
+			     "profitable.\n");
+	  return;
+	}
+
+      *ret_min_profitable_combine_niters = 0;
+      return;
+    }
+
   /* Don't try to vectorize epilogue of epilogue.  */
   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
     return;
@@ -3795,7 +3948,9 @@  vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
     {
       if (flag_vect_epilogue_cost_model == VECT_COST_MODEL_UNLIMITED)
 	{
-	  if (flag_tree_vectorize_epilogues & VECT_EPILOGUE_COMBINE)
+	  if (flag_tree_vectorize_epilogues & VECT_EPILOGUE_MASK)
+	    LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = true;
+	  else if (flag_tree_vectorize_epilogues & VECT_EPILOGUE_COMBINE)
 	    *ret_min_profitable_combine_niters = 0;
 	  return;
 	}
@@ -3854,6 +4009,29 @@  vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
 			     profitable_iters);
 	  *ret_min_profitable_combine_niters = profitable_iters;
 	}
+
+      if (!(flag_tree_vectorize_epilogues & VECT_EPILOGUE_MASK))
+	return;
+
+      /* Now compute profitability for loop epilogue masking.
+	 The following condition must hold true:
+	 SIC * epilogue_niters + SOC > VIC + MIC + MPC  */
+      int min_profitable_masking_niters
+	= (vec_inside_cost + masking_inside_cost + masking_prologue_cost
+	   - scalar_outside_cost) / scalar_single_iter_cost;
+      if (min_profitable_masking_niters > peel_iters_epilogue)
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_NOTE, vect_location,
+			     "Loop epilogue masking is not profitable.\n");
+	}
+      else
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_NOTE, vect_location,
+			     "Loop epilogue masking is profitable.\n");
+	  LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = true;
+	}
     }
 }
 
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index 5f15246..f70aed6 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -539,7 +539,16 @@  vectorize_loops (void)
 	loop->aux = loop_vinfo;
 
 	if (!loop_vinfo || !LOOP_VINFO_VECTORIZABLE_P (loop_vinfo))
-	  continue;
+	  {
+	    if (loop_vinfo
+		&& LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+		&& LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo)
+		&& dump_enabled_p ())
+	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			       "loop epilogue can't be vectorized.\n");
+
+	    continue;
+	  }
 
         if (!dbg_cnt (vect_loop))
 	  {