diff mbox

[vec-tails,03/10] Support epilogues vectorization with no masking

Message ID 20160519193939.GD40563@msticlxl57.ims.intel.com
State New
Headers show

Commit Message

Ilya Enkovich May 19, 2016, 7:39 p.m. UTC
Hi,

This patch introduces the changes required to run the vectorizer on a loop
epilogue.  It also enables epilogue vectorization using a smaller vector size.

Thanks,
Ilya
--
gcc/

2016-05-19  Ilya Enkovich  <ilya.enkovich@intel.com>

	* tree-if-conv.c (tree_if_conversion): Make public.
	* tree-if-conv.h: New file.
	* tree-vect-data-refs.c (vect_enhance_data_refs_alignment): Don't
	try to enhance alignment for epilogues.
	* tree-vect-loop-manip.c (vect_do_peeling_for_loop_bound): Return
	created loop.
	* tree-vect-loop.c: include tree-if-conv.h.
	(destroy_loop_vec_info): Preserve LOOP_VINFO_ORIG_LOOP_INFO in
	loop->aux.
	(vect_analyze_loop_form): Init LOOP_VINFO_ORIG_LOOP_INFO and reset
	loop->aux.
	(vect_analyze_loop): Reset loop->aux.
	(vect_transform_loop): Check if created epilogue should be returned
	for further vectorization.  If-convert epilogue if required.
	* tree-vectorizer.c (vectorize_loops): Add a queue of loops to
	process and insert vectorized loop epilogues into this queue.
	* tree-vectorizer.h (vect_do_peeling_for_loop_bound): Return created
	loop.
	(vect_transform_loop): Return created loop.

Comments

Richard Biener June 15, 2016, 11:03 a.m. UTC | #1
On Thu, May 19, 2016 at 9:39 PM, Ilya Enkovich <enkovich.gnu@gmail.com> wrote:
> Hi,
>
> This patch introduces changes required to run vectorizer on loop epilogue.
> This also enables epilogue vectorization using a vector of smaller size.

While the idea of epilogue vectorization sounds straightforward, the
implementation is somewhat icky with all the ->aux stuff, "redundant"
if-conversion and loop iteration stuff.

So I was thinking of when epilogue vectorization is beneficial, which
is obviously when the overall loop trip count is low.  We are not good
at optimizing for that case generally (too much peeling for alignment,
using expensive avx256 vectorization, etc.), so I wonder if versioning
for that case would be a better idea (performance-wise).

Thus - what cases were you looking at when deciding that vectorizing
the epilogue (with a smaller vector size) is profitable?  Do other
compilers generally do this?

Thanks,
Richard.

> Thanks,
> Ilya
> --
> gcc/
>
> 2016-05-19  Ilya Enkovich  <ilya.enkovich@intel.com>
>
>         * tree-if-conv.c (tree_if_conversion): Make public.
>         * tree-if-conv.h: New file.
>         * tree-vect-data-refs.c (vect_enhance_data_refs_alignment): Don't
>         try to enhance alignment for epilogues.
>         * tree-vect-loop-manip.c (vect_do_peeling_for_loop_bound): Return
>         created loop.
>         * tree-vect-loop.c: include tree-if-conv.h.
>         (destroy_loop_vec_info): Preserve LOOP_VINFO_ORIG_LOOP_INFO in
>         loop->aux.
>         (vect_analyze_loop_form): Init LOOP_VINFO_ORIG_LOOP_INFO and reset
>         loop->aux.
>         (vect_analyze_loop): Reset loop->aux.
>         (vect_transform_loop): Check if created epilogue should be returned
>         for further vectorization.  If-convert epilogue if required.
>         * tree-vectorizer.c (vectorize_loops): Add a queue of loops to
>         process and insert vectorized loop epilogues into this queue.
>         * tree-vectorizer.h (vect_do_peeling_for_loop_bound): Return created
>         loop.
>         (vect_transform_loop): Return created loop.
>
>
> diff --git a/gcc/tree-if-conv.c b/gcc/tree-if-conv.c
> index c38e21b..41b6c99 100644
> --- a/gcc/tree-if-conv.c
> +++ b/gcc/tree-if-conv.c
> @@ -2801,7 +2801,7 @@ ifcvt_local_dce (basic_block bb)
>     profitability analysis.  Returns non-zero todo flags when something
>     changed.  */
>
> -static unsigned int
> +unsigned int
>  tree_if_conversion (struct loop *loop)
>  {
>    unsigned int todo = 0;
> diff --git a/gcc/tree-if-conv.h b/gcc/tree-if-conv.h
> new file mode 100644
> index 0000000..3a732c2
> --- /dev/null
> +++ b/gcc/tree-if-conv.h
> @@ -0,0 +1,24 @@
> +/* Copyright (C) 2016 Free Software Foundation, Inc.
> +
> +This file is part of GCC.
> +
> +GCC is free software; you can redistribute it and/or modify it under
> +the terms of the GNU General Public License as published by the Free
> +Software Foundation; either version 3, or (at your option) any later
> +version.
> +
> +GCC is distributed in the hope that it will be useful, but WITHOUT ANY
> +WARRANTY; without even the implied warranty of MERCHANTABILITY or
> +FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
> +for more details.
> +
> +You should have received a copy of the GNU General Public License
> +along with GCC; see the file COPYING3.  If not see
> +<http://www.gnu.org/licenses/>.  */
> +
> +#ifndef GCC_TREE_IF_CONV_H
> +#define GCC_TREE_IF_CONV_H
> +
> +unsigned int tree_if_conversion (struct loop *);
> +
> +#endif  /* GCC_TREE_IF_CONV_H  */
> diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
> index 7652e21..f275933 100644
> --- a/gcc/tree-vect-data-refs.c
> +++ b/gcc/tree-vect-data-refs.c
> @@ -1595,7 +1595,10 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
>    /* Check if we can possibly peel the loop.  */
>    if (!vect_can_advance_ivs_p (loop_vinfo)
>        || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
> -      || loop->inner)
> +      || loop->inner
> +      /* Required peeling was performed in prologue and
> +        is not required for epilogue.  */
> +      || LOOP_VINFO_EPILOGUE_P (loop_vinfo))
>      do_peeling = false;
>
>    if (do_peeling
> @@ -1875,7 +1878,10 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
>
>    do_versioning =
>         optimize_loop_nest_for_speed_p (loop)
> -       && (!loop->inner); /* FORNOW */
> +       && (!loop->inner) /* FORNOW */
> +        /* Required versioning was performed for the
> +          original loop and is not required for epilogue.  */
> +       && !LOOP_VINFO_EPILOGUE_P (loop_vinfo);
>
>    if (do_versioning)
>      {
> diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
> index 7ec6dae..fab5879 100644
> --- a/gcc/tree-vect-loop-manip.c
> +++ b/gcc/tree-vect-loop-manip.c
> @@ -1742,9 +1742,11 @@ vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters,
>     NITERS / VECTORIZATION_FACTOR times (this value is placed into RATIO).
>
>     COND_EXPR and COND_EXPR_STMT_LIST are combined with a new generated
> -   test.  */
> +   test.
>
> -void
> +   Return created loop.  */
> +
> +struct loop *
>  vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo,
>                                 tree ni_name, tree ratio_mult_vf_name,
>                                 unsigned int th, bool check_profitability)
> @@ -1812,6 +1814,8 @@ vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo,
>    scev_reset ();
>
>    free_original_copy_tables ();
> +
> +  return new_loop;
>  }
>
>
> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> index aac0df9..a537ef4 100644
> --- a/gcc/tree-vect-loop.c
> +++ b/gcc/tree-vect-loop.c
> @@ -47,6 +47,7 @@ along with GCC; see the file COPYING3.  If not see
>  #include "tree-vectorizer.h"
>  #include "gimple-fold.h"
>  #include "cgraph.h"
> +#include "tree-if-conv.h"
>
>  /* Loop Vectorization Pass.
>
> @@ -1212,8 +1213,8 @@ destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
>    destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
>    loop_vinfo->scalar_cost_vec.release ();
>
> +  loop->aux = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
>    free (loop_vinfo);
> -  loop->aux = NULL;
>  }
>
>
> @@ -1499,13 +1500,24 @@ vect_analyze_loop_form (struct loop *loop)
>
>    if (! vect_analyze_loop_form_1 (loop, &loop_cond, &number_of_iterationsm1,
>                                   &number_of_iterations, &inner_loop_cond))
> -    return NULL;
> +    {
> +      loop->aux = NULL;
> +      return NULL;
> +    }
>
>    loop_vec_info loop_vinfo = new_loop_vec_info (loop);
>    LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
>    LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
>    LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
>
> +  /* For epilogues we want to vectorize aux holds
> +     loop_vec_info of the original loop.  */
> +  if (loop->aux)
> +    {
> +      gcc_assert (LOOP_VINFO_VECTORIZABLE_P ((loop_vec_info)loop->aux));
> +      LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = (loop_vec_info)loop->aux;
> +    }
> +
>    if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
>      {
>        if (dump_enabled_p ())
> @@ -1522,7 +1534,6 @@ vect_analyze_loop_form (struct loop *loop)
>      STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
>        = loop_exit_ctrl_vec_info_type;
>
> -  gcc_assert (!loop->aux);
>    loop->aux = loop_vinfo;
>    return loop_vinfo;
>  }
> @@ -2280,7 +2291,10 @@ vect_analyze_loop (struct loop *loop)
>        if (fatal
>           || vector_sizes == 0
>           || current_vector_size == 0)
> -       return NULL;
> +       {
> +         loop->aux = NULL;
> +         return NULL;
> +       }
>
>        /* Try the next biggest vector size.  */
>        current_vector_size = 1 << floor_log2 (vector_sizes);
> @@ -6576,10 +6590,11 @@ vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
>     Vectorize the loop - created vectorized stmts to replace the scalar
>     stmts in the loop, and update the loop exit condition.  */
>
> -void
> +struct loop *
>  vect_transform_loop (loop_vec_info loop_vinfo)
>  {
>    struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
> +  struct loop *epilogue = NULL;
>    basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
>    int nbbs = loop->num_nodes;
>    int i;
> @@ -6661,8 +6676,9 @@ vect_transform_loop (loop_vec_info loop_vinfo)
>         ni_name = vect_build_loop_niters (loop_vinfo);
>        vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
>                                        &ratio);
> -      vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
> -                                     th, check_profitability);
> +      epilogue = vect_do_peeling_for_loop_bound (loop_vinfo, ni_name,
> +                                                ratio_mult_vf, th,
> +                                                check_profitability);
>      }
>    else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
>      ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
> @@ -6959,6 +6975,64 @@ vect_transform_loop (loop_vec_info loop_vinfo)
>    FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
>      vect_free_slp_instance (instance);
>    LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
> +
> +  /* Don't vectorize epilogue for epilogue.  */
> +  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
> +    epilogue = NULL;
> +  /* Scalar epilogue is not vectorized in case
> +     we use combined vector epilogue.  */
> +  else if (LOOP_VINFO_COMBINE_EPILOGUE (loop_vinfo))
> +    epilogue = NULL;
> +  /* FORNOW: Currently alias checks are not inherited for epilogues.
> +     Don't try to vectorize epilogue because it will require
> +     additional alias checks.  */
> +  else if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
> +    epilogue = NULL;
> +
> +  if (epilogue)
> +    {
> +      if (!LOOP_VINFO_MASK_EPILOGUE (loop_vinfo))
> +       {
> +         unsigned int vector_sizes
> +           = targetm.vectorize.autovectorize_vector_sizes ();
> +         vector_sizes &= current_vector_size - 1;
> +
> +         if (!(flag_tree_vectorize_epilogues & VECT_EPILOGUE_NOMASK))
> +           epilogue = NULL;
> +         else if (!vector_sizes)
> +           epilogue = NULL;
> +         else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
> +                  && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
> +           {
> +             int smallest_vec_size = 1 << ctz_hwi (vector_sizes);
> +             int ratio = current_vector_size / smallest_vec_size;
> +             int eiters = LOOP_VINFO_INT_NITERS (loop_vinfo)
> +               - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
> +             eiters = eiters % vectorization_factor;
> +
> +             epilogue->nb_iterations_upper_bound = eiters - 1;
> +
> +             if (eiters < vectorization_factor / ratio)
> +               epilogue = NULL;
> +           }
> +       }
> +    }
> +
> +  if (epilogue)
> +    {
> +      epilogue->force_vectorize = loop->force_vectorize;
> +      epilogue->safelen = loop->safelen;
> +      epilogue->dont_vectorize = false;
> +
> +      /* We may need to if-convert epilogue to vectorize it.  */
> +      if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
> +       tree_if_conversion (epilogue);
> +
> +      gcc_assert (!epilogue->aux);
> +      epilogue->aux = loop_vinfo;
> +    }
> +
> +  return epilogue;
>  }
>
>  /* The code below is trying to perform simple optimization - revert
> diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
> index 2b25b45..5f15246 100644
> --- a/gcc/tree-vectorizer.c
> +++ b/gcc/tree-vectorizer.c
> @@ -491,14 +491,16 @@ vectorize_loops (void)
>  {
>    unsigned int i;
>    unsigned int num_vectorized_loops = 0;
> -  unsigned int vect_loops_num;
> +  unsigned int vect_loops_num = number_of_loops (cfun);
>    struct loop *loop;
>    hash_table<simduid_to_vf> *simduid_to_vf_htab = NULL;
>    hash_table<simd_array_to_simduid> *simd_array_to_simduid_htab = NULL;
>    bool any_ifcvt_loops = false;
>    unsigned ret = 0;
> +  auto_vec<unsigned int> loops (vect_loops_num);
>
> -  vect_loops_num = number_of_loops (cfun);
> +  FOR_EACH_LOOP (loop, 0)
> +    loops.quick_push (loop->num);
>
>    /* Bail out if there are no loops.  */
>    if (vect_loops_num <= 1)
> @@ -514,14 +516,18 @@ vectorize_loops (void)
>    /* If some loop was duplicated, it gets bigger number
>       than all previously defined loops.  This fact allows us to run
>       only over initial loops skipping newly generated ones.  */
> -  FOR_EACH_LOOP (loop, 0)
> -    if (loop->dont_vectorize)
> +  for (i = 0; i < loops.length (); i++)
> +    if (!(loop = get_loop (cfun, loops[i])))
> +      continue;
> +    else if (loop->dont_vectorize)
>        any_ifcvt_loops = true;
>      else if ((flag_tree_loop_vectorize
> -             && optimize_loop_nest_for_speed_p (loop))
> +             && (optimize_loop_nest_for_speed_p (loop)
> +                 || loop->aux))
>              || loop->force_vectorize)
>        {
>         loop_vec_info loop_vinfo;
> +       struct loop *new_loop;
>         vect_location = find_loop_location (loop);
>          if (LOCATION_LOCUS (vect_location) != UNKNOWN_LOCATION
>             && dump_enabled_p ())
> @@ -551,12 +557,21 @@ vectorize_loops (void)
>             && dump_enabled_p ())
>            dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
>                             "loop vectorized\n");
> -       vect_transform_loop (loop_vinfo);
> +       new_loop = vect_transform_loop (loop_vinfo);
>         num_vectorized_loops++;
>         /* Now that the loop has been vectorized, allow it to be unrolled
>            etc.  */
>         loop->force_vectorize = false;
>
> +       /* Add new loop to a processing queue.  To make it easier
> +          to match loop and its epilogue vectorization in dumps
> +          put new loop as the next loop to process.  */
> +       if (new_loop)
> +         {
> +           loops.safe_insert (i + 1, new_loop->num);
> +           vect_loops_num = number_of_loops (cfun);
> +         }
> +
>         if (loop->simduid)
>           {
>             simduid_to_vf *simduid_to_vf_data = XNEW (simduid_to_vf);
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index 4c19317..b269752 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -984,8 +984,8 @@ extern bool slpeel_can_duplicate_loop_p (const struct loop *, const_edge);
>  struct loop *slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *,
>                                                      struct loop *, edge);
>  extern void vect_loop_versioning (loop_vec_info, unsigned int, bool);
> -extern void vect_do_peeling_for_loop_bound (loop_vec_info, tree, tree,
> -                                           unsigned int, bool);
> +extern struct loop *vect_do_peeling_for_loop_bound (loop_vec_info, tree, tree,
> +                                                   unsigned int, bool);
>  extern void vect_do_peeling_for_alignment (loop_vec_info, tree,
>                                            unsigned int, bool);
>  extern source_location find_loop_location (struct loop *);
> @@ -1101,7 +1101,7 @@ extern gimple *vect_force_simple_reduction (loop_vec_info, gimple *, bool,
>  /* Drive for loop analysis stage.  */
>  extern loop_vec_info vect_analyze_loop (struct loop *);
>  /* Drive for loop transformation stage.  */
> -extern void vect_transform_loop (loop_vec_info);
> +extern struct loop *vect_transform_loop (loop_vec_info);
>  extern loop_vec_info vect_analyze_loop_form (struct loop *);
>  extern bool vectorizable_live_operation (gimple *, gimple_stmt_iterator *,
>                                          gimple **);
Jeff Law June 16, 2016, 5:22 a.m. UTC | #2
On 06/15/2016 05:03 AM, Richard Biener wrote:
> On Thu, May 19, 2016 at 9:39 PM, Ilya Enkovich
> <enkovich.gnu@gmail.com> wrote:
>> Hi,
>>
>> This patch introduces changes required to run vectorizer on loop
>> epilogue. This also enables epilogue vectorization using a vector
>> of smaller size.
>
> While the idea of epilogue vectorization sounds straight-forward the
> implementation is somewhat icky with all the ->aux stuff, "redundant"
> if-conversion and loop iteration stuff.
>
> So I was thinking of when epilogue vectorization is beneficial which
> is obviously when the overall loop trip count is low.  We are not
> good in optimizing for that case generally (too much peeling for
> alignment, using expensive avx256 vectorization, etc.), so I wonder
> if versioning for that case would be a better idea
> (performance-wise).
>
> Thus - what cases were you looking at when deciding that vectorizing
> the epilogue (with a smaller vector size) is profitable?  Do other
> compilers generally do this?
I would think it's better stated that the relative benefits of 
vectorizing the epilogue are greater the shorter the loop, but that's 
nit-picking the discussion.

I do think you've got a legitimate question though.   Ilya, can you give 
any insights here based on your KNL and Haswell testing or data/insights 
from the LLVM and/or ICC teams?

Jeff
Jeff Law June 16, 2016, 6 a.m. UTC | #3
On 05/19/2016 01:39 PM, Ilya Enkovich wrote:
> Hi,
>
> This patch introduces changes required to run vectorizer on loop epilogue.
> This also enables epilogue vectorization using a vector of smaller size.
>
> Thanks,
> Ilya
> --
> gcc/
>
> 2016-05-19  Ilya Enkovich  <ilya.enkovich@intel.com>
>
> 	* tree-if-conv.c (tree_if_conversion): Make public.
> 	* tree-if-conv.h: New file.
> 	* tree-vect-data-refs.c (vect_enhance_data_refs_alignment): Don't
> 	try to enhance alignment for epilogues.
> 	* tree-vect-loop-manip.c (vect_do_peeling_for_loop_bound): Return
> 	created loop.
> 	* tree-vect-loop.c: include tree-if-conv.h.
> 	(destroy_loop_vec_info): Preserve LOOP_VINFO_ORIG_LOOP_INFO in
> 	loop->aux.
> 	(vect_analyze_loop_form): Init LOOP_VINFO_ORIG_LOOP_INFO and reset
> 	loop->aux.
> 	(vect_analyze_loop): Reset loop->aux.
> 	(vect_transform_loop): Check if created epilogue should be returned
> 	for further vectorization.  If-convert epilogue if required.
> 	* tree-vectorizer.c (vectorize_loops): Add a queue of loops to
> 	process and insert vectorized loop epilogues into this queue.
> 	* tree-vectorizer.h (vect_do_peeling_for_loop_bound): Return created
> 	loop.
> 	(vect_transform_loop): Return created loop.
As Richi noted, the additional calls into the if-converter are 
unfortunate.  I'm not sure how else to avoid them though.  It looks like 
we can run if-conversion on just the epilogue, so maybe that's not too bad.


> @@ -1212,8 +1213,8 @@ destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
>    destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
>    loop_vinfo->scalar_cost_vec.release ();
>
> +  loop->aux = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
>    free (loop_vinfo);
> -  loop->aux = NULL;
>  }
Hmm, there seems to be a level of indirection I'm missing here.  We're 
smuggling LOOP_VINFO_ORIG_LOOP_INFO around in loop->aux.  Ewww.  I 
thought the whole point of LOOP_VINFO_ORIG_LOOP_INFO was to smuggle the 
VINFO from the original loop to the vectorized epilogue.  What am I 
missing?  Rather than smuggling around in the aux field, is there some 
inherent reason why we can't just copy the info from the original loop 
directly into LOOP_VINFO_ORIG_LOOP_INFO for the vectorized epilogue?

> +  /* FORNOW: Currently alias checks are not inherited for epilogues.
> +     Don't try to vectorize epilogue because it will require
> +     additional alias checks.  */
Are the alias checks here redundant with the ones done for the original 
loop?  If so won't DOM eliminate them?


And something just occurred to me -- is there some inherent reason why 
SLP doesn't vectorize the epilogue, particularly for the cases where we 
can vectorize the epilogue using smaller vectors?  Sorry if you've 
already answered this somewhere or it's a dumb question.



>
> +	/* Add new loop to a processing queue.  To make it easier
> +	   to match loop and its epilogue vectorization in dumps
> +	   put new loop as the next loop to process.  */
> +	if (new_loop)
> +	  {
> +	    loops.safe_insert (i + 1, new_loop->num);
> +	    vect_loops_num = number_of_loops (cfun);
> +	  }
> +
So just to be clear, the only reason to do this is for dumps -- other
than processing the loop before its epilogue, there's no other
inherently necessary ordering of the loops, right?


Jeff
Ilya Enkovich June 17, 2016, 2:16 p.m. UTC | #4
2016-06-16 8:22 GMT+03:00 Jeff Law <law@redhat.com>:
> On 06/15/2016 05:03 AM, Richard Biener wrote:
>>
>> On Thu, May 19, 2016 at 9:39 PM, Ilya Enkovich
>> <enkovich.gnu@gmail.com> wrote:
>>>
>>> Hi,
>>>
>>> This patch introduces changes required to run vectorizer on loop
>>> epilogue. This also enables epilogue vectorization using a vector
>>> of smaller size.
>>
>>
>> While the idea of epilogue vectorization sounds straight-forward the
>> implementation is somewhat icky with all the ->aux stuff, "redundant"
>> if-conversion and loop iteration stuff.
>>
>> So I was thinking of when epilogue vectorization is beneficial which
>> is obviously when the overall loop trip count is low.  We are not
>> good in optimizing for that case generally (too much peeling for
>> alignment, using expensive avx256 vectorization, etc.), so I wonder
>> if versioning for that case would be a better idea
>> (performance-wise).
>>
>> Thus - what cases were you looking at when deciding that vectorizing
>> the epilogue (with a smaller vector size) is profitable?  Do other
>> compilers generally do this?
>
> I would think it's better stated that the relative benefits of vectorizing
> the epilogue are greater the shorter the loop, but that's nit-picking the
> discussion.
>
> I do think you've got a legitimate question though.   Ilya, can you give any
> insights here based on your KNL and Haswell testing or data/insights from
> the LLVM and/or ICC teams?

I have no information about LLVM.  As I said in the other thread, ICC uses all
of the options (masked epilogue, combined loop, vectorized epilogue with a
smaller vector size).  It may also generate different versions (e.g. combined
and with masked epilogue) and choose dynamically depending on the iteration count.

Thanks,
Ilya

>
> Jeff
Ilya Enkovich June 17, 2016, 2:33 p.m. UTC | #5
2016-06-16 9:00 GMT+03:00 Jeff Law <law@redhat.com>:
> On 05/19/2016 01:39 PM, Ilya Enkovich wrote:
>>
>> Hi,
>>
>> This patch introduces changes required to run vectorizer on loop epilogue.
>> This also enables epilogue vectorization using a vector of smaller size.
>>
>> Thanks,
>> Ilya
>> --
>> gcc/
>>
>> 2016-05-19  Ilya Enkovich  <ilya.enkovich@intel.com>
>>
>>         * tree-if-conv.c (tree_if_conversion): Make public.
>>         * tree-if-conv.h: New file.
>>         * tree-vect-data-refs.c (vect_enhance_data_refs_alignment): Don't
>>         try to enhance alignment for epilogues.
>>         * tree-vect-loop-manip.c (vect_do_peeling_for_loop_bound): Return
>>         created loop.
>>         * tree-vect-loop.c: include tree-if-conv.h.
>>         (destroy_loop_vec_info): Preserve LOOP_VINFO_ORIG_LOOP_INFO in
>>         loop->aux.
>>         (vect_analyze_loop_form): Init LOOP_VINFO_ORIG_LOOP_INFO and reset
>>         loop->aux.
>>         (vect_analyze_loop): Reset loop->aux.
>>         (vect_transform_loop): Check if created epilogue should be
>> returned
>>         for further vectorization.  If-convert epilogue if required.
>>         * tree-vectorizer.c (vectorize_loops): Add a queue of loops to
>>         process and insert vectorized loop epilogues into this queue.
>>         * tree-vectorizer.h (vect_do_peeling_for_loop_bound): Return
>> created
>>         loop.
>>         (vect_transform_loop): Return created loop.
>
> As Richi noted, the additional calls into the if-converter are unfortunate.
> I'm not sure how else to avoid them though.  It looks like we can run
> if-conversion on just the epilogue, so maybe that's not too bad.
>
>
>> @@ -1212,8 +1213,8 @@ destroy_loop_vec_info (loop_vec_info loop_vinfo,
>> bool clean_stmts)
>>    destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
>>    loop_vinfo->scalar_cost_vec.release ();
>>
>> +  loop->aux = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
>>    free (loop_vinfo);
>> -  loop->aux = NULL;
>>  }
>
> Hmm, there seems to be a level of indirection I'm missing here.  We're
> smuggling LOOP_VINFO_ORIG_LOOP_INFO around in loop->aux.  Ewww.  I thought
> the whole point of LOOP_VINFO_ORIG_LOOP_INFO was to smuggle the VINFO from
> the original loop to the vectorized epilogue.  What am I missing?  Rather
> than smuggling around in the aux field, is there some inherent reason why we
> can't just copy the info from the original loop directly into
> LOOP_VINFO_ORIG_LOOP_INFO for the vectorized epilogue?

LOOP_VINFO_ORIG_LOOP_INFO is used for several things:
 - mark this loop as epilogue
 - get VF of original loop (required for both mask and nomask modes)
 - get decision about epilogue masking

That's all.  When the epilogue is created it has no LOOP_VINFO.  Also, when we
vectorize a loop we create and destroy its LOOP_VINFO multiple times.  When a
loop has a LOOP_VINFO, loop->aux points to it and the original LOOP_VINFO is in
LOOP_VINFO_ORIG_LOOP_INFO.  When a loop has no LOOP_VINFO associated, I have no
place to bind it to the original loop and therefore I use the vacant loop->aux
for that.  Any other way to bind the epilogue to its original loop would work
as well.  I just chose loop->aux to avoid new fields and data structures.

>
>> +  /* FORNOW: Currently alias checks are not inherited for epilogues.
>> +     Don't try to vectorize epilogue because it will require
>> +     additional alias checks.  */
>
> Are the alias checks here redundant with the ones done for the original
> loop?  If so won't DOM eliminate them?

I revisited this part recently and thought it should actually be safe to
assume we have no aliasing in the epilogue because we are dominated by the alias
checks of the original loop.  So I prepared a patch to remove this restriction
and avoid generating alias checks for epilogues (so we compute the alias checks
required but don't emit them).  I haven't sent this patch yet.
Do you think it is a valid assumption?

>
>
> And something just occurred to me -- is there some inherent reason why SLP
> doesn't vectorize the epilogue, particularly for the cases where we can
> vectorize the epilogue using smaller vectors?  Sorry if you've already
> answered this somewhere or it's a dumb question.

IIUC this may happen only if we unroll the epilogue into a single BB, which
happens only when the epilogue iteration count is known.  Right?

>
>
>
>>
>> +       /* Add new loop to a processing queue.  To make it easier
>> +          to match loop and its epilogue vectorization in dumps
>> +          put new loop as the next loop to process.  */
>> +       if (new_loop)
>> +         {
>> +           loops.safe_insert (i + 1, new_loop->num);
>> +           vect_loops_num = number_of_loops (cfun);
>> +         }
>> +
>
> So just to be clear, the only reason to do this is for dumps -- other than
> processing the loop before it's epilogue, there's no other inherently
> necessary ordering of the loops, right?

Right, I don't see other reasons to do it.

Thanks,
Ilya

>
>
> Jeff
Bin.Cheng June 17, 2016, 2:48 p.m. UTC | #6
On Fri, Jun 17, 2016 at 3:33 PM, Ilya Enkovich <enkovich.gnu@gmail.com> wrote:
> 2016-06-16 9:00 GMT+03:00 Jeff Law <law@redhat.com>:
>> On 05/19/2016 01:39 PM, Ilya Enkovich wrote:
>>>
>>> Hi,
>>>
>>> This patch introduces changes required to run vectorizer on loop epilogue.
>>> This also enables epilogue vectorization using a vector of smaller size.
>>>
>>> Thanks,
>>> Ilya
>>> --
>>> gcc/
>>>
>>> 2016-05-19  Ilya Enkovich  <ilya.enkovich@intel.com>
>>>
>>>         * tree-if-conv.c (tree_if_conversion): Make public.
>>>         * tree-if-conv.h: New file.
>>>         * tree-vect-data-refs.c (vect_enhance_data_refs_alignment): Don't
>>>         try to enhance alignment for epilogues.
>>>         * tree-vect-loop-manip.c (vect_do_peeling_for_loop_bound): Return
>>>         created loop.
>>>         * tree-vect-loop.c: include tree-if-conv.h.
>>>         (destroy_loop_vec_info): Preserve LOOP_VINFO_ORIG_LOOP_INFO in
>>>         loop->aux.
>>>         (vect_analyze_loop_form): Init LOOP_VINFO_ORIG_LOOP_INFO and reset
>>>         loop->aux.
>>>         (vect_analyze_loop): Reset loop->aux.
>>>         (vect_transform_loop): Check if created epilogue should be
>>> returned
>>>         for further vectorization.  If-convert epilogue if required.
>>>         * tree-vectorizer.c (vectorize_loops): Add a queue of loops to
>>>         process and insert vectorized loop epilogues into this queue.
>>>         * tree-vectorizer.h (vect_do_peeling_for_loop_bound): Return
>>> created
>>>         loop.
>>>         (vect_transform_loop): Return created loop.
>>
>> As Richi noted, the additional calls into the if-converter are unfortunate.
>> I'm not sure how else to avoid them though.  It looks like we can run
>> if-conversion on just the epilogue, so maybe that's not too bad.
>>
>>
>>> @@ -1212,8 +1213,8 @@ destroy_loop_vec_info (loop_vec_info loop_vinfo,
>>> bool clean_stmts)
>>>    destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
>>>    loop_vinfo->scalar_cost_vec.release ();
>>>
>>> +  loop->aux = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
>>>    free (loop_vinfo);
>>> -  loop->aux = NULL;
>>>  }
>>
>> Hmm, there seems to be a level of indirection I'm missing here.  We're
>> smuggling LOOP_VINFO_ORIG_LOOP_INFO around in loop->aux.  Ewww.  I thought
>> the whole point of LOOP_VINFO_ORIG_LOOP_INFO was to smuggle the VINFO from
>> the original loop to the vectorized epilogue.  What am I missing?  Rather
>> than smuggling around in the aux field, is there some inherent reason why we
>> can't just copy the info from the original loop directly into
>> LOOP_VINFO_ORIG_LOOP_INFO for the vectorized epilogue?
>
> LOOP_VINFO_ORIG_LOOP_INFO is used for several things:
>  - mark this loop as epilogue
>  - get VF of original loop (required for both mask and nomask modes)
>  - get decision about epilogue masking
>
> That's all.  When epilogue is created it has no LOOP_VINFO.  Also when we
> vectorize loop we create and destroy its LOOP_VINFO multiple times.  When
> loop has LOOP_VINFO loop->aux points to it and original LOOP_VINFO is in
> LOOP_VINFO_ORIG_LOOP_INFO.  When Loop has no LOOP_VINFO associated I have no
> place to bind it with the original loop and therefore I use vacant loop->aux
> for that.  Any other way to bind epilogue with its original loop would work
> as well.  I just chose loop->aux to avoid new fields and data structures.
>
>>
>>> +  /* FORNOW: Currently alias checks are not inherited for epilogues.
>>> +     Don't try to vectorize epilogue because it will require
>>> +     additional alias checks.  */
>>
>> Are the alias checks here redundant with the ones done for the original
>> loop?  If so won't DOM eliminate them?
>
> I revisited this part recently and thought it should actually be safe to
> assume we have no aliasing in epilogue because we are dominated by alias
> checks of the original loop.  So I prepared a patch to remove this restriction
> and avoid alias checks generation for epilogues (so we compute aliases checks
> required but don't emit them).  I didn't send this patch yet.
> Do you think it is a valid assumption?
I recently visited that part and agree it's valid, unless epilogue
loop is vectorized in larger vector-units, but that would be unlikely
to happen, right?  BTW, does this patch start all over analyzing
epilogue loop?  As you said the alias checks will be computed.

Thanks,
bin
>
>>
>>
>> And something just occurred to me -- is there some inherent reason why SLP
>> doesn't vectorize the epilogue, particularly for the cases where we can
>> vectorize the epilogue using smaller vectors?  Sorry if you've already
>> answered this somewhere or it's a dumb question.
>
> IIUC this may happen only if we unroll epilogue into a single BB which happens
> only when epilogue iterations count is known. Right?
>
>>
>>
>>
>>>
>>> +       /* Add new loop to a processing queue.  To make it easier
>>> +          to match loop and its epilogue vectorization in dumps
>>> +          put new loop as the next loop to process.  */
>>> +       if (new_loop)
>>> +         {
>>> +           loops.safe_insert (i + 1, new_loop->num);
>>> +           vect_loops_num = number_of_loops (cfun);
>>> +         }
>>> +
>>
>> So just to be clear, the only reason to do this is for dumps -- other than
>> processing the loop before it's epilogue, there's no other inherently
>> necessary ordering of the loops, right?
>
> Right, I don't see other reasons to do it.
>
> Thanks,
> Ilya
>
>>
>>
>> Jeff
Ilya Enkovich June 17, 2016, 2:54 p.m. UTC | #7
2016-06-17 17:48 GMT+03:00 Bin.Cheng <amker.cheng@gmail.com>:
> On Fri, Jun 17, 2016 at 3:33 PM, Ilya Enkovich <enkovich.gnu@gmail.com> wrote:
>> 2016-06-16 9:00 GMT+03:00 Jeff Law <law@redhat.com>:
>>> On 05/19/2016 01:39 PM, Ilya Enkovich wrote:
>>>>
>>>> Hi,
>>>>
>>>> This patch introduces changes required to run vectorizer on loop epilogue.
>>>> This also enables epilogue vectorization using a vector of smaller size.
>>>>
>>>> Thanks,
>>>> Ilya
>>>> --
>>>> gcc/
>>>>
>>>> 2016-05-19  Ilya Enkovich  <ilya.enkovich@intel.com>
>>>>
>>>>         * tree-if-conv.c (tree_if_conversion): Make public.
>>>>         * tree-if-conv.h: New file.
>>>>         * tree-vect-data-refs.c (vect_enhance_data_refs_alignment): Don't
>>>>         try to enhance alignment for epilogues.
>>>>         * tree-vect-loop-manip.c (vect_do_peeling_for_loop_bound): Return
>>>>         created loop.
>>>>         * tree-vect-loop.c: include tree-if-conv.h.
>>>>         (destroy_loop_vec_info): Preserve LOOP_VINFO_ORIG_LOOP_INFO in
>>>>         loop->aux.
>>>>         (vect_analyze_loop_form): Init LOOP_VINFO_ORIG_LOOP_INFO and reset
>>>>         loop->aux.
>>>>         (vect_analyze_loop): Reset loop->aux.
>>>>         (vect_transform_loop): Check if created epilogue should be
>>>> returned
>>>>         for further vectorization.  If-convert epilogue if required.
>>>>         * tree-vectorizer.c (vectorize_loops): Add a queue of loops to
>>>>         process and insert vectorized loop epilogues into this queue.
>>>>         * tree-vectorizer.h (vect_do_peeling_for_loop_bound): Return
>>>> created
>>>>         loop.
>>>>         (vect_transform_loop): Return created loop.
>>>
>>> As Richi noted, the additional calls into the if-converter are unfortunate.
>>> I'm not sure how else to avoid them though.  It looks like we can run
>>> if-conversion on just the epilogue, so maybe that's not too bad.
>>>
>>>
>>>> @@ -1212,8 +1213,8 @@ destroy_loop_vec_info (loop_vec_info loop_vinfo,
>>>> bool clean_stmts)
>>>>    destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
>>>>    loop_vinfo->scalar_cost_vec.release ();
>>>>
>>>> +  loop->aux = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
>>>>    free (loop_vinfo);
>>>> -  loop->aux = NULL;
>>>>  }
>>>
>>> Hmm, there seems to be a level of indirection I'm missing here.  We're
>>> smuggling LOOP_VINFO_ORIG_LOOP_INFO around in loop->aux.  Ewww.  I thought
>>> the whole point of LOOP_VINFO_ORIG_LOOP_INFO was to smuggle the VINFO from
>>> the original loop to the vectorized epilogue.  What am I missing?  Rather
>>> than smuggling around in the aux field, is there some inherent reason why we
>>> can't just copy the info from the original loop directly into
>>> LOOP_VINFO_ORIG_LOOP_INFO for the vectorized epilogue?
>>
>> LOOP_VINFO_ORIG_LOOP_INFO is used for several things:
>>  - mark this loop as epilogue
>>  - get VF of original loop (required for both mask and nomask modes)
>>  - get decision about epilogue masking
>>
>> That's all.  When epilogue is created it has no LOOP_VINFO.  Also when we
>> vectorize loop we create and destroy its LOOP_VINFO multiple times.  When
>> loop has LOOP_VINFO loop->aux points to it and original LOOP_VINFO is in
>> LOOP_VINFO_ORIG_LOOP_INFO.  When Loop has no LOOP_VINFO associated I have no
>> place to bind it with the original loop and therefore I use vacant loop->aux
>> for that.  Any other way to bind epilogue with its original loop would work
>> as well.  I just chose loop->aux to avoid new fields and data structures.
>>
>>>
>>>> +  /* FORNOW: Currently alias checks are not inherited for epilogues.
>>>> +     Don't try to vectorize epilogue because it will require
>>>> +     additional alias checks.  */
>>>
>>> Are the alias checks here redundant with the ones done for the original
>>> loop?  If so won't DOM eliminate them?
>>
>> I revisited this part recently and thought it should actually be safe to
>> assume we have no aliasing in epilogue because we are dominated by alias
>> checks of the original loop.  So I prepared a patch to remove this restriction
>> and avoid alias checks generation for epilogues (so we compute aliases checks
>> required but don't emit them).  I didn't send this patch yet.
>> Do you think it is a valid assumption?
> I recently visited that part and agree it's valid, unless epilogue
> loop is vectorized in larger vector-units, but that would be unlikely
> to happen, right?  BTW, does this patch start all over analyzing
> epilogue loop?  As you said the alias checks will be computed.

Original loop is vectorized for the max possible vector size and we can't
(and don't want to) choose a bigger one.

We don't preserve any info for epilogue.  Actually even when we try various
vector sizes for a single loop we recompute everything for each vector size.

Thanks,
Ilya

>
> Thanks,
> bin
>>
>>>
>>>
>>> And something just occurred to me -- is there some inherent reason why SLP
>>> doesn't vectorize the epilogue, particularly for the cases where we can
>>> vectorize the epilogue using smaller vectors?  Sorry if you've already
>>> answered this somewhere or it's a dumb question.
>>
>> IIUC this may happen only if we unroll epilogue into a single BB which happens
>> only when epilogue iterations count is known. Right?
>>
>>>
>>>
>>>
>>>>
>>>> +       /* Add new loop to a processing queue.  To make it easier
>>>> +          to match loop and its epilogue vectorization in dumps
>>>> +          put new loop as the next loop to process.  */
>>>> +       if (new_loop)
>>>> +         {
>>>> +           loops.safe_insert (i + 1, new_loop->num);
>>>> +           vect_loops_num = number_of_loops (cfun);
>>>> +         }
>>>> +
>>>
>>> So just to be clear, the only reason to do this is for dumps -- other than
>>> processing the loop before it's epilogue, there's no other inherently
>>> necessary ordering of the loops, right?
>>
>> Right, I don't see other reasons to do it.
>>
>> Thanks,
>> Ilya
>>
>>>
>>>
>>> Jeff
Jeff Law June 17, 2016, 3:37 p.m. UTC | #8
On 06/17/2016 08:48 AM, Bin.Cheng wrote:
>>>
>>>> +  /* FORNOW: Currently alias checks are not inherited for epilogues.
>>>> +     Don't try to vectorize epilogue because it will require
>>>> +     additional alias checks.  */
>>>
>>> Are the alias checks here redundant with the ones done for the original
>>> loop?  If so won't DOM eliminate them?
>>
>> I revisited this part recently and thought it should actually be safe to
>> assume we have no aliasing in epilogue because we are dominated by alias
>> checks of the original loop.  So I prepared a patch to remove this restriction
>> and avoid alias checks generation for epilogues (so we compute aliases checks
>> required but don't emit them).  I didn't send this patch yet.
>> Do you think it is a valid assumption?
> I recently visited that part and agree it's valid, unless epilogue
> loop is vectorized in larger vector-units, but that would be unlikely
> to happen, right?  BTW, does this patch start all over analyzing
> epilogue loop?  As you said the alias checks will be computed.
I think we're OK either way.  If you emit the checks, DOM ought to 
eliminate them as they'd be dominated by the earlier check.

But I'm a fan of not generating dumb code for later passes to clean up, 
so I think we should just avoid generating the additional checks if we 
can reasonably do so in the vectorizer.

I can't envision a scenario where we'd want a larger vector size in the 
epilogue than the main loop.

Jeff
Jeff Law June 17, 2016, 3:39 p.m. UTC | #9
On 06/17/2016 08:16 AM, Ilya Enkovich wrote:
>>
>> I do think you've got a legitimate question though.   Ilya, can you give any
>> insights here based on your KNL and Haswell testing or data/insights from
>> the LLVM and/or ICC teams?
>
> I have no information about LLVM.  As I said in other thread ICC uses all
> options (masked epilogue, combined loop, vectorized epilogue with smaller
> vector size).  It also may generate different versions (e.g. combined and
> with masked epilogue) and choose dynamically depending on iterations count.
Any guidance from the ICC team on the costing model to choose between 
the different approaches?

I'm a bit surprised that there's enough value in doing this much work to 
vectorize the epilogue, but that appears to be the case...

jeff
Bin.Cheng June 17, 2016, 3:49 p.m. UTC | #10
On Fri, Jun 17, 2016 at 4:37 PM, Jeff Law <law@redhat.com> wrote:
> On 06/17/2016 08:48 AM, Bin.Cheng wrote:
>>>>
>>>>
>>>>> +  /* FORNOW: Currently alias checks are not inherited for epilogues.
>>>>> +     Don't try to vectorize epilogue because it will require
>>>>> +     additional alias checks.  */
>>>>
>>>>
>>>> Are the alias checks here redundant with the ones done for the original
>>>> loop?  If so won't DOM eliminate them?
>>>
>>>
>>> I revisited this part recently and thought it should actually be safe to
>>> assume we have no aliasing in epilogue because we are dominated by alias
>>> checks of the original loop.  So I prepared a patch to remove this
>>> restriction
>>> and avoid alias checks generation for epilogues (so we compute aliases
>>> checks
>>> required but don't emit them).  I didn't send this patch yet.
>>> Do you think it is a valid assumption?
>>
>> I recently visited that part and agree it's valid, unless epilogue
>> loop is vectorized in larger vector-units, but that would be unlikely
>> to happen, right?  BTW, does this patch start all over analyzing
>> epilogue loop?  As you said the alias checks will be computed.
>
> I think we're OK either way.  If you emit the checks, DOM ought to eliminate
> them as they'd be dominated by the earlier check.
Unfortunately DOM probably can't.  In particular, constant offsets are
folded deep in expressions and they could be different under smaller
vector-units.  Even if it can, it will introduce a long live range since
the check result will be combined with some others.  Not sure if all
checks can be avoided; alignment checks should be ok too?

Thanks,
bin
>
> But I'm a fan of not generating dumb code for later passes to clean up, so I
> think we should just avoid generating the additional checks if we can
> reasonably do so in the vectorizer.
>
> I can't envision a scenario where we'd want a larger vector size in the
> epilogue than the main loop.
>
> Jeff
>
Jeff Law June 17, 2016, 4:46 p.m. UTC | #11
On 06/17/2016 08:33 AM, Ilya Enkovich wrote:
>>
>> Hmm, there seems to be a level of indirection I'm missing here.  We're
>> smuggling LOOP_VINFO_ORIG_LOOP_INFO around in loop->aux.  Ewww.  I thought
>> the whole point of LOOP_VINFO_ORIG_LOOP_INFO was to smuggle the VINFO from
>> the original loop to the vectorized epilogue.  What am I missing?  Rather
>> than smuggling around in the aux field, is there some inherent reason why we
>> can't just copy the info from the original loop directly into
>> LOOP_VINFO_ORIG_LOOP_INFO for the vectorized epilogue?
>
> LOOP_VINFO_ORIG_LOOP_INFO is used for several things:
>  - mark this loop as epilogue
>  - get VF of original loop (required for both mask and nomask modes)
>  - get decision about epilogue masking
>
> That's all.  When epilogue is created it has no LOOP_VINFO.  Also when we
> vectorize loop we create and destroy its LOOP_VINFO multiple times.  When
> loop has LOOP_VINFO loop->aux points to it and original LOOP_VINFO is in
> LOOP_VINFO_ORIG_LOOP_INFO.  When Loop has no LOOP_VINFO associated I have no
> place to bind it with the original loop and therefore I use vacant loop->aux
> for that.  Any other way to bind epilogue with its original loop would work
> as well.  I just chose loop->aux to avoid new fields and data structures.
I was starting to draw the conclusion that the smuggling in the aux 
field was for cases when there was no LOOP_VINFO.  But it was rather late
at night and I didn't follow that idea through the code.  Thanks for
clarifying.


>>
>> And something just occurred to me -- is there some inherent reason why SLP
>> doesn't vectorize the epilogue, particularly for the cases where we can
>> vectorize the epilogue using smaller vectors?  Sorry if you've already
>> answered this somewhere or it's a dumb question.
>
> IIUC this may happen only if we unroll epilogue into a single BB which happens
> only when epilogue iterations count is known. Right?
Probably.  The need to make sure the epilogue is unrolled probably makes 
this a non-starter.

I have a soft spot for SLP as I stumbled on the idea while rewriting a 
presentation in the wee hours of the morning for the next day. 
Essentially it was a "poor man's" vectorizer that could be done for 
dramatically less engineering cost than a traditional vectorizer.  The 
MIT paper outlining the same ideas came out a couple years later...


>> +       /* Add new loop to a processing queue.  To make it easier
>>> +          to match loop and its epilogue vectorization in dumps
>>> +          put new loop as the next loop to process.  */
>>> +       if (new_loop)
>>> +         {
>>> +           loops.safe_insert (i + 1, new_loop->num);
>>> +           vect_loops_num = number_of_loops (cfun);
>>> +         }
>>> +
>>
>> So just to be clear, the only reason to do this is for dumps -- other than
>> processing the loop before it's epilogue, there's no other inherently
>> necessary ordering of the loops, right?
>
> Right, I don't see other reasons to do it.
Perfect.  Thanks for confirming.

jeff
Richard Biener July 15, 2016, 10:28 a.m. UTC | #12
On Thu, Jun 16, 2016 at 8:00 AM, Jeff Law <law@redhat.com> wrote:
> On 05/19/2016 01:39 PM, Ilya Enkovich wrote:
>>
>> Hi,
>>
>> This patch introduces changes required to run vectorizer on loop epilogue.
>> This also enables epilogue vectorization using a vector of smaller size.
>>
>> Thanks,
>> Ilya
>> --
>> gcc/
>>
>> 2016-05-19  Ilya Enkovich  <ilya.enkovich@intel.com>
>>
>>         * tree-if-conv.c (tree_if_conversion): Make public.
>>         * tree-if-conv.h: New file.
>>         * tree-vect-data-refs.c (vect_enhance_data_refs_alignment): Don't
>>         try to enhance alignment for epilogues.
>>         * tree-vect-loop-manip.c (vect_do_peeling_for_loop_bound): Return
>>         created loop.
>>         * tree-vect-loop.c: include tree-if-conv.h.
>>         (destroy_loop_vec_info): Preserve LOOP_VINFO_ORIG_LOOP_INFO in
>>         loop->aux.
>>         (vect_analyze_loop_form): Init LOOP_VINFO_ORIG_LOOP_INFO and reset
>>         loop->aux.
>>         (vect_analyze_loop): Reset loop->aux.
>>         (vect_transform_loop): Check if created epilogue should be
>> returned
>>         for further vectorization.  If-convert epilogue if required.
>>         * tree-vectorizer.c (vectorize_loops): Add a queue of loops to
>>         process and insert vectorized loop epilogues into this queue.
>>         * tree-vectorizer.h (vect_do_peeling_for_loop_bound): Return
>> created
>>         loop.
>>         (vect_transform_loop): Return created loop.
>
> As Richi noted, the additional calls into the if-converter are unfortunate.
> I'm not sure how else to avoid them though.  It looks like we can run
> if-conversion on just the epilogue, so maybe that's not too bad.

We could use the if-converted loop as source when doing the loop copy
for the epilogue...  (and do it similar to if-conversion when it inserts a
__builtin_vectorized_loop () check, that is, create two versions for
the epilogue).

>> @@ -1212,8 +1213,8 @@ destroy_loop_vec_info (loop_vec_info loop_vinfo,
>> bool clean_stmts)
>>    destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
>>    loop_vinfo->scalar_cost_vec.release ();
>>
>> +  loop->aux = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
>>    free (loop_vinfo);
>> -  loop->aux = NULL;
>>  }
>
> Hmm, there seems to be a level of indirection I'm missing here.  We're
> smuggling LOOP_VINFO_ORIG_LOOP_INFO around in loop->aux.  Ewww.  I thought
> the whole point of LOOP_VINFO_ORIG_LOOP_INFO was to smuggle the VINFO from
> the original loop to the vectorized epilogue.  What am I missing?  Rather
> than smuggling around in the aux field, is there some inherent reason why we
> can't just copy the info from the original loop directly into
> LOOP_VINFO_ORIG_LOOP_INFO for the vectorized epilogue?
>
>> +  /* FORNOW: Currently alias checks are not inherited for epilogues.
>> +     Don't try to vectorize epilogue because it will require
>> +     additional alias checks.  */
>
> Are the alias checks here redundant with the ones done for the original
> loop?  If so won't DOM eliminate them?

They are too complex for this.  But the epilogue could be annotated with ivdep
pragma / safelen in some way?

> And something just occurred to me -- is there some inherent reason why SLP
> doesn't vectorize the epilogue, particularly for the cases where we can
> vectorize the epilogue using smaller vectors?  Sorry if you've already
> answered this somewhere or it's a dumb question.

It usually can but only if we unroll the epilogue later (and thus when the
number of iterations is known at compile-time).

>
>
>>
>> +       /* Add new loop to a processing queue.  To make it easier
>> +          to match loop and its epilogue vectorization in dumps
>> +          put new loop as the next loop to process.  */
>> +       if (new_loop)
>> +         {
>> +           loops.safe_insert (i + 1, new_loop->num);
>> +           vect_loops_num = number_of_loops (cfun);
>> +         }
>> +
>
> So just to be clear, the only reason to do this is for dumps -- other than
> processing the loop before it's epilogue, there's no other inherently
> necessary ordering of the loops, right?
>
>
> Jeff
Richard Biener July 15, 2016, 10:29 a.m. UTC | #13
On Fri, Jun 17, 2016 at 4:33 PM, Ilya Enkovich <enkovich.gnu@gmail.com> wrote:
> 2016-06-16 9:00 GMT+03:00 Jeff Law <law@redhat.com>:
>> On 05/19/2016 01:39 PM, Ilya Enkovich wrote:
>>>
>>> Hi,
>>>
>>> This patch introduces changes required to run vectorizer on loop epilogue.
>>> This also enables epilogue vectorization using a vector of smaller size.
>>>
>>> Thanks,
>>> Ilya
>>> --
>>> gcc/
>>>
>>> 2016-05-19  Ilya Enkovich  <ilya.enkovich@intel.com>
>>>
>>>         * tree-if-conv.c (tree_if_conversion): Make public.
>>>         * tree-if-conv.h: New file.
>>>         * tree-vect-data-refs.c (vect_enhance_data_refs_alignment): Don't
>>>         try to enhance alignment for epilogues.
>>>         * tree-vect-loop-manip.c (vect_do_peeling_for_loop_bound): Return
>>>         created loop.
>>>         * tree-vect-loop.c: include tree-if-conv.h.
>>>         (destroy_loop_vec_info): Preserve LOOP_VINFO_ORIG_LOOP_INFO in
>>>         loop->aux.
>>>         (vect_analyze_loop_form): Init LOOP_VINFO_ORIG_LOOP_INFO and reset
>>>         loop->aux.
>>>         (vect_analyze_loop): Reset loop->aux.
>>>         (vect_transform_loop): Check if created epilogue should be
>>> returned
>>>         for further vectorization.  If-convert epilogue if required.
>>>         * tree-vectorizer.c (vectorize_loops): Add a queue of loops to
>>>         process and insert vectorized loop epilogues into this queue.
>>>         * tree-vectorizer.h (vect_do_peeling_for_loop_bound): Return
>>> created
>>>         loop.
>>>         (vect_transform_loop): Return created loop.
>>
>> As Richi noted, the additional calls into the if-converter are unfortunate.
>> I'm not sure how else to avoid them though.  It looks like we can run
>> if-conversion on just the epilogue, so maybe that's not too bad.
>>
>>
>>> @@ -1212,8 +1213,8 @@ destroy_loop_vec_info (loop_vec_info loop_vinfo,
>>> bool clean_stmts)
>>>    destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
>>>    loop_vinfo->scalar_cost_vec.release ();
>>>
>>> +  loop->aux = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
>>>    free (loop_vinfo);
>>> -  loop->aux = NULL;
>>>  }
>>
>> Hmm, there seems to be a level of indirection I'm missing here.  We're
>> smuggling LOOP_VINFO_ORIG_LOOP_INFO around in loop->aux.  Ewww.  I thought
>> the whole point of LOOP_VINFO_ORIG_LOOP_INFO was to smuggle the VINFO from
>> the original loop to the vectorized epilogue.  What am I missing?  Rather
>> than smuggling around in the aux field, is there some inherent reason why we
>> can't just copy the info from the original loop directly into
>> LOOP_VINFO_ORIG_LOOP_INFO for the vectorized epilogue?
>
> LOOP_VINFO_ORIG_LOOP_INFO is used for several things:
>  - mark this loop as epilogue
>  - get VF of original loop (required for both mask and nomask modes)
>  - get decision about epilogue masking
>
> That's all.  When epilogue is created it has no LOOP_VINFO.  Also when we
> vectorize loop we create and destroy its LOOP_VINFO multiple times.  When
> loop has LOOP_VINFO loop->aux points to it and original LOOP_VINFO is in
> LOOP_VINFO_ORIG_LOOP_INFO.  When Loop has no LOOP_VINFO associated I have no
> place to bind it with the original loop and therefore I use vacant loop->aux
> for that.  Any other way to bind epilogue with its original loop would work
> as well.  I just chose loop->aux to avoid new fields and data structures.

Maybe simply change the way the vectorizer iterates over loops, e.g. by
recursing on the generated epilogue and passing down its origin.

>>
>>> +  /* FORNOW: Currently alias checks are not inherited for epilogues.
>>> +     Don't try to vectorize epilogue because it will require
>>> +     additional alias checks.  */
>>
>> Are the alias checks here redundant with the ones done for the original
>> loop?  If so won't DOM eliminate them?
>
> I revisited this part recently and thought it should actually be safe to
> assume we have no aliasing in epilogue because we are dominated by alias
> checks of the original loop.  So I prepared a patch to remove this restriction
> and avoid alias checks generation for epilogues (so we compute aliases checks
> required but don't emit them).  I didn't send this patch yet.
> Do you think it is a valid assumption?
>
>>
>>
>> And something just occurred to me -- is there some inherent reason why SLP
>> doesn't vectorize the epilogue, particularly for the cases where we can
>> vectorize the epilogue using smaller vectors?  Sorry if you've already
>> answered this somewhere or it's a dumb question.
>
> IIUC this may happen only if we unroll epilogue into a single BB which happens
> only when epilogue iterations count is known. Right?
>
>>
>>
>>
>>>
>>> +       /* Add new loop to a processing queue.  To make it easier
>>> +          to match loop and its epilogue vectorization in dumps
>>> +          put new loop as the next loop to process.  */
>>> +       if (new_loop)
>>> +         {
>>> +           loops.safe_insert (i + 1, new_loop->num);
>>> +           vect_loops_num = number_of_loops (cfun);
>>> +         }
>>> +
>>
>> So just to be clear, the only reason to do this is for dumps -- other than
>> processing the loop before it's epilogue, there's no other inherently
>> necessary ordering of the loops, right?
>
> Right, I don't see other reasons to do it.
>
> Thanks,
> Ilya
>
>>
>>
>> Jeff
diff mbox

Patch

diff --git a/gcc/tree-if-conv.c b/gcc/tree-if-conv.c
index c38e21b..41b6c99 100644
--- a/gcc/tree-if-conv.c
+++ b/gcc/tree-if-conv.c
@@ -2801,7 +2801,7 @@  ifcvt_local_dce (basic_block bb)
    profitability analysis.  Returns non-zero todo flags when something
    changed.  */
 
-static unsigned int
+unsigned int
 tree_if_conversion (struct loop *loop)
 {
   unsigned int todo = 0;
diff --git a/gcc/tree-if-conv.h b/gcc/tree-if-conv.h
new file mode 100644
index 0000000..3a732c2
--- /dev/null
+++ b/gcc/tree-if-conv.h
@@ -0,0 +1,24 @@ 
+/* Copyright (C) 2016 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_TREE_IF_CONV_H
+#define GCC_TREE_IF_CONV_H
+
+unsigned int tree_if_conversion (struct loop *);
+
+#endif  /* GCC_TREE_IF_CONV_H  */
diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
index 7652e21..f275933 100644
--- a/gcc/tree-vect-data-refs.c
+++ b/gcc/tree-vect-data-refs.c
@@ -1595,7 +1595,10 @@  vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
   /* Check if we can possibly peel the loop.  */
   if (!vect_can_advance_ivs_p (loop_vinfo)
       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
-      || loop->inner)
+      || loop->inner
+      /* Required peeling was performed in prologue and
+	 is not required for epilogue.  */
+      || LOOP_VINFO_EPILOGUE_P (loop_vinfo))
     do_peeling = false;
 
   if (do_peeling
@@ -1875,7 +1878,10 @@  vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
 
   do_versioning =
 	optimize_loop_nest_for_speed_p (loop)
-	&& (!loop->inner); /* FORNOW */
+	&& (!loop->inner) /* FORNOW */
+        /* Required versioning was performed for the
+	   original loop and is not required for epilogue.  */
+	&& !LOOP_VINFO_EPILOGUE_P (loop_vinfo);
 
   if (do_versioning)
     {
diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
index 7ec6dae..fab5879 100644
--- a/gcc/tree-vect-loop-manip.c
+++ b/gcc/tree-vect-loop-manip.c
@@ -1742,9 +1742,11 @@  vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters,
    NITERS / VECTORIZATION_FACTOR times (this value is placed into RATIO).
 
    COND_EXPR and COND_EXPR_STMT_LIST are combined with a new generated
-   test.  */
+   test.
 
-void
+   Return created loop.  */
+
+struct loop *
 vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo,
 				tree ni_name, tree ratio_mult_vf_name,
 				unsigned int th, bool check_profitability)
@@ -1812,6 +1814,8 @@  vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo,
   scev_reset ();
 
   free_original_copy_tables ();
+
+  return new_loop;
 }
 
 
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index aac0df9..a537ef4 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -47,6 +47,7 @@  along with GCC; see the file COPYING3.  If not see
 #include "tree-vectorizer.h"
 #include "gimple-fold.h"
 #include "cgraph.h"
+#include "tree-if-conv.h"
 
 /* Loop Vectorization Pass.
 
@@ -1212,8 +1213,8 @@  destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
   loop_vinfo->scalar_cost_vec.release ();
 
+  loop->aux = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
   free (loop_vinfo);
-  loop->aux = NULL;
 }
 
 
@@ -1499,13 +1500,24 @@  vect_analyze_loop_form (struct loop *loop)
 
   if (! vect_analyze_loop_form_1 (loop, &loop_cond, &number_of_iterationsm1,
 				  &number_of_iterations, &inner_loop_cond))
-    return NULL;
+    {
+      loop->aux = NULL;
+      return NULL;
+    }
 
   loop_vec_info loop_vinfo = new_loop_vec_info (loop);
   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
 
+  /* For epilogues we want to vectorize aux holds
+     loop_vec_info of the original loop.  */
+  if (loop->aux)
+    {
+      gcc_assert (LOOP_VINFO_VECTORIZABLE_P ((loop_vec_info)loop->aux));
+      LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = (loop_vec_info)loop->aux;
+    }
+
   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
     {
       if (dump_enabled_p ())
@@ -1522,7 +1534,6 @@  vect_analyze_loop_form (struct loop *loop)
     STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
       = loop_exit_ctrl_vec_info_type;
 
-  gcc_assert (!loop->aux);
   loop->aux = loop_vinfo;
   return loop_vinfo;
 }
@@ -2280,7 +2291,10 @@  vect_analyze_loop (struct loop *loop)
       if (fatal
 	  || vector_sizes == 0
 	  || current_vector_size == 0)
-	return NULL;
+	{
+	  loop->aux = NULL;
+	  return NULL;
+	}
 
       /* Try the next biggest vector size.  */
       current_vector_size = 1 << floor_log2 (vector_sizes);
@@ -6576,10 +6590,11 @@  vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
    Vectorize the loop - created vectorized stmts to replace the scalar
    stmts in the loop, and update the loop exit condition.  */
 
-void
+struct loop *
 vect_transform_loop (loop_vec_info loop_vinfo)
 {
   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  struct loop *epilogue = NULL;
   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
   int nbbs = loop->num_nodes;
   int i;
@@ -6661,8 +6676,9 @@  vect_transform_loop (loop_vec_info loop_vinfo)
 	ni_name = vect_build_loop_niters (loop_vinfo);
       vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
 				       &ratio);
-      vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
-				      th, check_profitability);
+      epilogue = vect_do_peeling_for_loop_bound (loop_vinfo, ni_name,
+						 ratio_mult_vf, th,
+						 check_profitability);
     }
   else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
     ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
@@ -6959,6 +6975,64 @@  vect_transform_loop (loop_vec_info loop_vinfo)
   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
     vect_free_slp_instance (instance);
   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
+
+  /* Don't vectorize the epilogue of an epilogue loop.  */
+  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
+    epilogue = NULL;
+  /* The scalar epilogue is not vectorized when
+     a combined vector epilogue is used.  */
+  else if (LOOP_VINFO_COMBINE_EPILOGUE (loop_vinfo))
+    epilogue = NULL;
+  /* FORNOW: Currently alias checks are not inherited for epilogues.
+     Don't try to vectorize epilogue because it will require
+     additional alias checks.  */
+  else if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
+    epilogue = NULL;
+
+  if (epilogue)
+    {
+      if (!LOOP_VINFO_MASK_EPILOGUE (loop_vinfo))
+	{
+	  unsigned int vector_sizes
+	    = targetm.vectorize.autovectorize_vector_sizes ();
+	  vector_sizes &= current_vector_size - 1;
+
+	  if (!(flag_tree_vectorize_epilogues & VECT_EPILOGUE_NOMASK))
+	    epilogue = NULL;
+	  else if (!vector_sizes)
+	    epilogue = NULL;
+	  else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+		   && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
+	    {
+	      int smallest_vec_size = 1 << ctz_hwi (vector_sizes);
+	      int ratio = current_vector_size / smallest_vec_size;
+	      int eiters = LOOP_VINFO_INT_NITERS (loop_vinfo)
+		- LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
+	      eiters = eiters % vectorization_factor;
+
+	      epilogue->nb_iterations_upper_bound = eiters - 1;
+
+	      if (eiters < vectorization_factor / ratio)
+		epilogue = NULL;
+	    }
+	}
+    }
+
+  if (epilogue)
+    {
+      epilogue->force_vectorize = loop->force_vectorize;
+      epilogue->safelen = loop->safelen;
+      epilogue->dont_vectorize = false;
+
+      /* We may need to if-convert epilogue to vectorize it.  */
+      if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
+	tree_if_conversion (epilogue);
+
+      gcc_assert (!epilogue->aux);
+      epilogue->aux = loop_vinfo;
+    }
+
+  return epilogue;
 }
 
 /* The code below is trying to perform simple optimization - revert
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index 2b25b45..5f15246 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -491,14 +491,16 @@  vectorize_loops (void)
 {
   unsigned int i;
   unsigned int num_vectorized_loops = 0;
-  unsigned int vect_loops_num;
+  unsigned int vect_loops_num = number_of_loops (cfun);
   struct loop *loop;
   hash_table<simduid_to_vf> *simduid_to_vf_htab = NULL;
   hash_table<simd_array_to_simduid> *simd_array_to_simduid_htab = NULL;
   bool any_ifcvt_loops = false;
   unsigned ret = 0;
+  auto_vec<unsigned int> loops (vect_loops_num);
 
-  vect_loops_num = number_of_loops (cfun);
+  FOR_EACH_LOOP (loop, 0)
+    loops.quick_push (loop->num);
 
   /* Bail out if there are no loops.  */
   if (vect_loops_num <= 1)
@@ -514,14 +516,18 @@  vectorize_loops (void)
   /* If some loop was duplicated, it gets bigger number
      than all previously defined loops.  This fact allows us to run
      only over initial loops skipping newly generated ones.  */
-  FOR_EACH_LOOP (loop, 0)
-    if (loop->dont_vectorize)
+  for (i = 0; i < loops.length (); i++)
+    if (!(loop = get_loop (cfun, loops[i])))
+      continue;
+    else if (loop->dont_vectorize)
       any_ifcvt_loops = true;
     else if ((flag_tree_loop_vectorize
-	      && optimize_loop_nest_for_speed_p (loop))
+	      && (optimize_loop_nest_for_speed_p (loop)
+		  || loop->aux))
 	     || loop->force_vectorize)
       {
 	loop_vec_info loop_vinfo;
+	struct loop *new_loop;
 	vect_location = find_loop_location (loop);
         if (LOCATION_LOCUS (vect_location) != UNKNOWN_LOCATION
 	    && dump_enabled_p ())
@@ -551,12 +557,21 @@  vectorize_loops (void)
 	    && dump_enabled_p ())
           dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
                            "loop vectorized\n");
-	vect_transform_loop (loop_vinfo);
+	new_loop = vect_transform_loop (loop_vinfo);
 	num_vectorized_loops++;
 	/* Now that the loop has been vectorized, allow it to be unrolled
 	   etc.  */
 	loop->force_vectorize = false;
 
+	/* Add the new loop to the processing queue.  To make it easier
+	   to match a loop with its epilogue's vectorization in dumps,
+	   put the new loop next in the queue.  */
+	if (new_loop)
+	  {
+	    loops.safe_insert (i + 1, new_loop->num);
+	    vect_loops_num = number_of_loops (cfun);
+	  }
+
 	if (loop->simduid)
 	  {
 	    simduid_to_vf *simduid_to_vf_data = XNEW (simduid_to_vf);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 4c19317..b269752 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -984,8 +984,8 @@  extern bool slpeel_can_duplicate_loop_p (const struct loop *, const_edge);
 struct loop *slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *,
 						     struct loop *, edge);
 extern void vect_loop_versioning (loop_vec_info, unsigned int, bool);
-extern void vect_do_peeling_for_loop_bound (loop_vec_info, tree, tree,
-					    unsigned int, bool);
+extern struct loop *vect_do_peeling_for_loop_bound (loop_vec_info, tree, tree,
+						    unsigned int, bool);
 extern void vect_do_peeling_for_alignment (loop_vec_info, tree,
 					   unsigned int, bool);
 extern source_location find_loop_location (struct loop *);
@@ -1101,7 +1101,7 @@  extern gimple *vect_force_simple_reduction (loop_vec_info, gimple *, bool,
 /* Drive for loop analysis stage.  */
 extern loop_vec_info vect_analyze_loop (struct loop *);
 /* Drive for loop transformation stage.  */
-extern void vect_transform_loop (loop_vec_info);
+extern struct loop *vect_transform_loop (loop_vec_info);
 extern loop_vec_info vect_analyze_loop_form (struct loop *);
 extern bool vectorizable_live_operation (gimple *, gimple_stmt_iterator *,
 					 gimple **);