
[6/7] Explicitly classify vector loads and stores

Message ID 87lh26ltp4.fsf@e105548-lin.cambridge.arm.com
State New

Commit Message

Richard Sandiford June 15, 2016, 8:52 a.m. UTC
This is the main patch in the series.  It adds a new enum and routines
for classifying a vector load or store implementation.

Tested on aarch64-linux-gnu and x86_64-linux-gnu.  OK to install?

Thanks,
Richard


gcc/
	* tree-vectorizer.h (vect_memory_access_type): New enum.
	(_stmt_vec_info): Add a memory_access_type field.
	(STMT_VINFO_MEMORY_ACCESS_TYPE): New macro.
	(vect_model_store_cost): Take an access type instead of a boolean.
	(vect_model_load_cost): Likewise.
	* tree-vect-slp.c (vect_analyze_slp_cost_1): Update calls to
	vect_model_store_cost and vect_model_load_cost.
	* tree-vect-stmts.c (vec_load_store_type): New enum.
	(vect_model_store_cost): Take an access type instead of a
	store_lanes_p boolean.  Simplify tests.
	(vect_model_load_cost): Likewise, but for load_lanes_p.
	(get_group_load_store_type, get_load_store_type): New functions.
	(vectorizable_store): Use get_load_store_type.  Record the access
	type in STMT_VINFO_MEMORY_ACCESS_TYPE.
	(vectorizable_load): Likewise.
	(vectorizable_mask_load_store): Likewise.  Replace is_store
	variable with vls_type.
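
For orientation, the new enum's comments (quoted in full in the patch below) distinguish six classes of access. Very roughly, and depending on what the target supports, these are the kinds of scalar access shapes each class is meant to cover; the function and variable names here are purely illustrative and are not taken from the patch or its testsuite:

/* Illustrative only: the real classification is done by the new
   get_load_store_type routine and depends on target capabilities.  */
void
access_shapes (int *restrict a, int *restrict b, int *idx, int stride, int n)
{
  for (int i = 0; i < n; i++)
    a[i] = b[i];                      /* VMAT_CONTIGUOUS */

  for (int i = 0; i < n; i++)
    {
      /* Grouped, interleaved accesses: VMAT_CONTIGUOUS_PERMUTE, or
         VMAT_LOAD_STORE_LANES if the target has load/store-lanes
         instructions (e.g. AArch64 LD2/ST2).  */
      a[2 * i] = b[2 * i + 1];
      a[2 * i + 1] = b[2 * i];
    }

  for (int i = 0; i < n; i++)
    a[i] = b[i * stride];             /* loop-invariant stride:
                                         VMAT_ELEMENTWISE, or
                                         VMAT_STRIDED_SLP for grouped
                                         SLP accesses */

  for (int i = 0; i < n; i++)
    a[i] = b[idx[i]];                 /* VMAT_GATHER_SCATTER, where the
                                         target provides gathers or
                                         scatters */
}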

Comments

Jeff Law June 21, 2016, 10:35 p.m. UTC | #1
On 06/15/2016 02:52 AM, Richard Sandiford wrote:
> This is the main patch in the series.  It adds a new enum and routines
> for classifying a vector load or store implementation.
>
> Tested on aarch64-linux-gnu and x86_64-linux-gnu.  OK to install?
>
> Thanks,
> Richard
>
>
> gcc/
> 	* tree-vectorizer.h (vect_memory_access_type): New enum.
> 	(_stmt_vec_info): Add a memory_access_type field.
> 	(STMT_VINFO_MEMORY_ACCESS_TYPE): New macro.
> 	(vect_model_store_cost): Take an access type instead of a boolean.
> 	(vect_model_load_cost): Likewise.
> 	* tree-vect-slp.c (vect_analyze_slp_cost_1): Update calls to
> 	vect_model_store_cost and vect_model_load_cost.
> 	* tree-vect-stmts.c (vec_load_store_type): New enum.
> 	(vect_model_store_cost): Take an access type instead of a
> 	store_lanes_p boolean.  Simplify tests.
> 	(vect_model_load_cost): Likewise, but for load_lanes_p.
> 	(get_group_load_store_type, get_load_store_type): New functions.
> 	(vectorizable_store): Use get_load_store_type.  Record the access
> 	type in STMT_VINFO_MEMORY_ACCESS_TYPE.
> 	(vectorizable_load): Likewise.
> 	(vectorizable_mask_load_store): Likewise.  Replace is_store
> 	variable with vls_type.
OK.  Looks like a nice cleanup to me.  If there's something that got 
goof'd along the way, I trust you'll deal with it appropriately -- I 
didn't try to map from every conditional back in the original code to 
the conditionals in the new code.


Jeff
Richard Biener July 1, 2016, 9:58 a.m. UTC | #2
On Wed, Jun 15, 2016 at 10:52 AM, Richard Sandiford
<richard.sandiford@arm.com> wrote:
> This is the main patch in the series.  It adds a new enum and routines
> for classifying a vector load or store implementation.
>
> Tested on aarch64-linux-gnu and x86_64-linux-gnu.  OK to install?

Why's the setting and checking of the memory access type conditional on !slp?
I'd rather avoid doing this :/

Otherwise it looks like a step in the right direction of splitting the
vectorizable_* functions into an analysis part that records all decisions
made and a transform part that just applies them.
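
For the memory access type specifically, the patch already has that shape: the analysis phase records the classification in the stmt_vec_info and the transform phase asserts that it matches rather than recomputing it. Condensed from the vectorizable_store hunks below (the !slp guard on the recording is the detail questioned above):

  vect_memory_access_type memory_access_type;
  if (!get_load_store_type (stmt, vectype, slp, vls_type,
                            &memory_access_type, &gs_info))
    return false;

  if (!vec_stmt)
    {
      /* Analysis: record the decision for the transform phase.  */
      if (!slp)
        STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) = memory_access_type;
      STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
      /* ... cost modelling via vect_model_store_cost ... */
      return true;
    }

  /* Transform: the recorded classification must still hold.  */
  if (!slp)
    gcc_assert (memory_access_type
                == STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info));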

Thanks,
Richard.

> Thanks,
> Richard
>
>
> gcc/
>         * tree-vectorizer.h (vect_memory_access_type): New enum.
>         (_stmt_vec_info): Add a memory_access_type field.
>         (STMT_VINFO_MEMORY_ACCESS_TYPE): New macro.
>         (vect_model_store_cost): Take an access type instead of a boolean.
>         (vect_model_load_cost): Likewise.
>         * tree-vect-slp.c (vect_analyze_slp_cost_1): Update calls to
>         vect_model_store_cost and vect_model_load_cost.
>         * tree-vect-stmts.c (vec_load_store_type): New enum.
>         (vect_model_store_cost): Take an access type instead of a
>         store_lanes_p boolean.  Simplify tests.
>         (vect_model_load_cost): Likewise, but for load_lanes_p.
>         (get_group_load_store_type, get_load_store_type): New functions.
>         (vectorizable_store): Use get_load_store_type.  Record the access
>         type in STMT_VINFO_MEMORY_ACCESS_TYPE.
>         (vectorizable_load): Likewise.
>         (vectorizable_mask_load_store): Likewise.  Replace is_store
>         variable with vls_type.
>
> Index: gcc/tree-vectorizer.h
> ===================================================================
> --- gcc/tree-vectorizer.h
> +++ gcc/tree-vectorizer.h
> @@ -485,6 +485,33 @@ enum slp_vect_type {
>    hybrid
>  };
>
> +/* Describes how we're going to vectorize an individual load or store,
> +   or a group of loads or stores.  */
> +enum vect_memory_access_type {
> +  /* A simple contiguous access.  */
> +  VMAT_CONTIGUOUS,
> +
> +  /* A simple contiguous access in which the elements need to be permuted
> +     after loading or before storing.  Only used for loop vectorization;
> +     SLP uses separate permutes.  */
> +  VMAT_CONTIGUOUS_PERMUTE,
> +
> +  /* An access that uses IFN_LOAD_LANES or IFN_STORE_LANES.  */
> +  VMAT_LOAD_STORE_LANES,
> +
> +  /* An access in which each scalar element is loaded or stored
> +     individually.  */
> +  VMAT_ELEMENTWISE,
> +
> +  /* A hybrid of VMAT_CONTIGUOUS and VMAT_ELEMENTWISE, used for grouped
> +     SLP accesses.  Each unrolled iteration uses a contiguous load
> +     or store for the whole group, but the groups from separate iterations
> +     are combined in the same way as for VMAT_ELEMENTWISE.  */
> +  VMAT_STRIDED_SLP,
> +
> +  /* The access uses gather loads or scatter stores.  */
> +  VMAT_GATHER_SCATTER
> +};
>
>  typedef struct data_reference *dr_p;
>
> @@ -602,6 +629,10 @@ typedef struct _stmt_vec_info {
>    /* True if this is an access with loop-invariant stride.  */
>    bool strided_p;
>
> +  /* Classifies how the load or store is going to be implemented
> +     for loop vectorization.  */
> +  vect_memory_access_type memory_access_type;
> +
>    /* For both loads and stores.  */
>    bool simd_lane_access_p;
>
> @@ -659,6 +690,7 @@ STMT_VINFO_BB_VINFO (stmt_vec_info stmt_vinfo)
>  #define STMT_VINFO_DATA_REF(S)             (S)->data_ref_info
>  #define STMT_VINFO_GATHER_SCATTER_P(S)    (S)->gather_scatter_p
>  #define STMT_VINFO_STRIDED_P(S)                   (S)->strided_p
> +#define STMT_VINFO_MEMORY_ACCESS_TYPE(S)   (S)->memory_access_type
>  #define STMT_VINFO_SIMD_LANE_ACCESS_P(S)   (S)->simd_lane_access_p
>  #define STMT_VINFO_VEC_REDUCTION_TYPE(S)   (S)->v_reduc_type
>
> @@ -1006,12 +1038,12 @@ extern void free_stmt_vec_info (gimple *stmt);
>  extern void vect_model_simple_cost (stmt_vec_info, int, enum vect_def_type *,
>                                      stmt_vector_for_cost *,
>                                     stmt_vector_for_cost *);
> -extern void vect_model_store_cost (stmt_vec_info, int, bool,
> +extern void vect_model_store_cost (stmt_vec_info, int, vect_memory_access_type,
>                                    enum vect_def_type, slp_tree,
>                                    stmt_vector_for_cost *,
>                                    stmt_vector_for_cost *);
> -extern void vect_model_load_cost (stmt_vec_info, int, bool, slp_tree,
> -                                 stmt_vector_for_cost *,
> +extern void vect_model_load_cost (stmt_vec_info, int, vect_memory_access_type,
> +                                 slp_tree, stmt_vector_for_cost *,
>                                   stmt_vector_for_cost *);
>  extern unsigned record_stmt_cost (stmt_vector_for_cost *, int,
>                                   enum vect_cost_for_stmt, stmt_vec_info,
> Index: gcc/tree-vect-slp.c
> ===================================================================
> --- gcc/tree-vect-slp.c
> +++ gcc/tree-vect-slp.c
> @@ -1490,9 +1490,13 @@ vect_analyze_slp_cost_1 (slp_instance instance, slp_tree node,
>    stmt_info = vinfo_for_stmt (stmt);
>    if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
>      {
> +      vect_memory_access_type memory_access_type
> +       = (STMT_VINFO_STRIDED_P (stmt_info)
> +          ? VMAT_STRIDED_SLP
> +          : VMAT_CONTIGUOUS);
>        if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
> -       vect_model_store_cost (stmt_info, ncopies_for_cost, false,
> -                              vect_uninitialized_def,
> +       vect_model_store_cost (stmt_info, ncopies_for_cost,
> +                              memory_access_type, vect_uninitialized_def,
>                                node, prologue_cost_vec, body_cost_vec);
>        else
>         {
> @@ -1515,8 +1519,9 @@ vect_analyze_slp_cost_1 (slp_instance instance, slp_tree node,
>               ncopies_for_cost *= SLP_INSTANCE_UNROLLING_FACTOR (instance);
>             }
>           /* Record the cost for the vector loads.  */
> -         vect_model_load_cost (stmt_info, ncopies_for_cost, false,
> -                               node, prologue_cost_vec, body_cost_vec);
> +         vect_model_load_cost (stmt_info, ncopies_for_cost,
> +                               memory_access_type, node, prologue_cost_vec,
> +                               body_cost_vec);
>           return;
>         }
>      }
> Index: gcc/tree-vect-stmts.c
> ===================================================================
> --- gcc/tree-vect-stmts.c
> +++ gcc/tree-vect-stmts.c
> @@ -52,6 +52,14 @@ along with GCC; see the file COPYING3.  If not see
>  /* For lang_hooks.types.type_for_mode.  */
>  #include "langhooks.h"
>
> +/* Says whether a statement is a load, a store of a vectorized statement
> +   result, or a store of an invariant value.  */
> +enum vec_load_store_type {
> +  VLS_LOAD,
> +  VLS_STORE,
> +  VLS_STORE_INVARIANT
> +};
> +
>  /* Return the vectorized type for the given statement.  */
>
>  tree
> @@ -873,8 +881,8 @@ vect_model_promotion_demotion_cost (stmt_vec_info stmt_info,
>
>  void
>  vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
> -                      bool store_lanes_p, enum vect_def_type dt,
> -                      slp_tree slp_node,
> +                      vect_memory_access_type memory_access_type,
> +                      enum vect_def_type dt, slp_tree slp_node,
>                        stmt_vector_for_cost *prologue_cost_vec,
>                        stmt_vector_for_cost *body_cost_vec)
>  {
> @@ -903,14 +911,9 @@ vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
>    /* We assume that the cost of a single store-lanes instruction is
>       equivalent to the cost of GROUP_SIZE separate stores.  If a grouped
>       access is instead being provided by a permute-and-store operation,
> -     include the cost of the permutes.
> -
> -     For SLP, the caller has already counted the permutation, if any.  */
> -  if (grouped_access_p
> -      && first_stmt_p
> -      && !store_lanes_p
> -      && !STMT_VINFO_STRIDED_P (stmt_info)
> -      && !slp_node)
> +     include the cost of the permutes.  */
> +  if (first_stmt_p
> +      && memory_access_type == VMAT_CONTIGUOUS_PERMUTE)
>      {
>        /* Uses a high and low interleave or shuffle operations for each
>          needed permute.  */
> @@ -927,17 +930,16 @@ vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
>
>    tree vectype = STMT_VINFO_VECTYPE (stmt_info);
>    /* Costs of the stores.  */
> -  if (STMT_VINFO_STRIDED_P (stmt_info) && !(slp_node && grouped_access_p))
> -    {
> -      /* N scalar stores plus extracting the elements.  */
> -      inside_cost += record_stmt_cost (body_cost_vec,
> -                                      ncopies * TYPE_VECTOR_SUBPARTS (vectype),
> -                                      scalar_store, stmt_info, 0, vect_body);
> -    }
> +  if (memory_access_type == VMAT_ELEMENTWISE)
> +    /* N scalar stores plus extracting the elements.  */
> +    inside_cost += record_stmt_cost (body_cost_vec,
> +                                    ncopies * TYPE_VECTOR_SUBPARTS (vectype),
> +                                    scalar_store, stmt_info, 0, vect_body);
>    else
>      vect_get_store_cost (dr, ncopies, &inside_cost, body_cost_vec);
>
> -  if (STMT_VINFO_STRIDED_P (stmt_info))
> +  if (memory_access_type == VMAT_ELEMENTWISE
> +      || memory_access_type == VMAT_STRIDED_SLP)
>      inside_cost += record_stmt_cost (body_cost_vec,
>                                      ncopies * TYPE_VECTOR_SUBPARTS (vectype),
>                                      vec_to_scalar, stmt_info, 0, vect_body);
> @@ -1011,7 +1013,8 @@ vect_get_store_cost (struct data_reference *dr, int ncopies,
>
>  void
>  vect_model_load_cost (stmt_vec_info stmt_info, int ncopies,
> -                     bool load_lanes_p, slp_tree slp_node,
> +                     vect_memory_access_type memory_access_type,
> +                     slp_tree slp_node,
>                       stmt_vector_for_cost *prologue_cost_vec,
>                       stmt_vector_for_cost *body_cost_vec)
>  {
> @@ -1036,14 +1039,9 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies,
>    /* We assume that the cost of a single load-lanes instruction is
>       equivalent to the cost of GROUP_SIZE separate loads.  If a grouped
>       access is instead being provided by a load-and-permute operation,
> -     include the cost of the permutes.
> -
> -     For SLP, the caller has already counted the permutation, if any.  */
> -  if (grouped_access_p
> -      && first_stmt_p
> -      && !load_lanes_p
> -      && !STMT_VINFO_STRIDED_P (stmt_info)
> -      && !slp_node)
> +     include the cost of the permutes.  */
> +  if (first_stmt_p
> +      && memory_access_type == VMAT_CONTIGUOUS_PERMUTE)
>      {
>        /* Uses an even and odd extract operations or shuffle operations
>          for each needed permute.  */
> @@ -1059,7 +1057,7 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies,
>      }
>
>    /* The loads themselves.  */
> -  if (STMT_VINFO_STRIDED_P (stmt_info) && !(slp_node && grouped_access_p))
> +  if (memory_access_type == VMAT_ELEMENTWISE)
>      {
>        /* N scalar loads plus gathering them into a vector.  */
>        tree vectype = STMT_VINFO_VECTYPE (stmt_info);
> @@ -1071,7 +1069,8 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies,
>      vect_get_load_cost (dr, ncopies, first_stmt_p,
>                         &inside_cost, &prologue_cost,
>                         prologue_cost_vec, body_cost_vec, true);
> -  if (STMT_VINFO_STRIDED_P (stmt_info))
> +  if (memory_access_type == VMAT_ELEMENTWISE
> +      || memory_access_type == VMAT_STRIDED_SLP)
>      inside_cost += record_stmt_cost (body_cost_vec, ncopies, vec_construct,
>                                      stmt_info, 0, vect_body);
>
> @@ -1674,6 +1673,209 @@ static tree permute_vec_elements (tree, tree, tree, gimple *,
>                                   gimple_stmt_iterator *);
>
>
> +/* A subroutine of get_load_store_type, with a subset of the same
> +   arguments.  Handle the case where STMT is part of a grouped load
> +   or store.
> +
> +   For stores, the statements in the group are all consecutive
> +   and there is no gap at the end.  For loads, the statements in the
> +   group might not be consecutive; there can be gaps between statements
> +   as well as at the end.  */
> +
> +static bool
> +get_group_load_store_type (gimple *stmt, tree vectype, bool slp,
> +                          vec_load_store_type vls_type,
> +                          vect_memory_access_type *memory_access_type)
> +{
> +  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
> +  vec_info *vinfo = stmt_info->vinfo;
> +  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
> +  struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
> +  gimple *first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
> +  unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt));
> +  bool single_element_p = (stmt == first_stmt
> +                          && !GROUP_NEXT_ELEMENT (stmt_info));
> +  unsigned HOST_WIDE_INT gap = GROUP_GAP (vinfo_for_stmt (first_stmt));
> +  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
> +
> +  /* True if the vectorized statements would access beyond the last
> +     statement in the group.  */
> +  bool overrun_p = false;
> +
> +  /* True if we can cope with such overrun by peeling for gaps, so that
> +     there is at least one final scalar iteration after the vector loop.  */
> +  bool can_overrun_p = (vls_type == VLS_LOAD && loop_vinfo && !loop->inner);
> +
> +  /* There can only be a gap at the end of the group if the stride is
> +     known at compile time.  */
> +  gcc_assert (!STMT_VINFO_STRIDED_P (stmt_info) || gap == 0);
> +
> +  /* Stores can't yet have gaps.  */
> +  gcc_assert (slp || vls_type == VLS_LOAD || gap == 0);
> +
> +  if (slp)
> +    {
> +      if (STMT_VINFO_STRIDED_P (stmt_info))
> +       {
> +         /* Try to use consecutive accesses of GROUP_SIZE elements,
> +            separated by the stride, until we have a complete vector.
> +            Fall back to scalar accesses if that isn't possible.  */
> +         if (nunits % group_size == 0)
> +           *memory_access_type = VMAT_STRIDED_SLP;
> +         else
> +           *memory_access_type = VMAT_ELEMENTWISE;
> +       }
> +      else
> +       {
> +         overrun_p = loop_vinfo && gap != 0;
> +         if (overrun_p && vls_type != VLS_LOAD)
> +           {
> +             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                              "Grouped store with gaps requires"
> +                              " non-consecutive accesses\n");
> +             return false;
> +           }
> +         if (overrun_p && !can_overrun_p)
> +           {
> +             if (dump_enabled_p ())
> +               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                                "Peeling for outer loop is not supported\n");
> +             return false;
> +           }
> +         *memory_access_type = VMAT_CONTIGUOUS;
> +       }
> +    }
> +  else
> +    {
> +      /* We can always handle this case using elementwise accesses,
> +        but see if something more efficient is available.  */
> +      *memory_access_type = VMAT_ELEMENTWISE;
> +
> +      /* If there is a gap at the end of the group then these optimizations
> +        would access excess elements in the last iteration.  */
> +      bool would_overrun_p = (gap != 0);
> +      if (!STMT_VINFO_STRIDED_P (stmt_info)
> +         && (can_overrun_p || !would_overrun_p))
> +       {
> +         /* First try using LOAD/STORE_LANES.  */
> +         if (vls_type == VLS_LOAD
> +             ? vect_load_lanes_supported (vectype, group_size)
> +             : vect_store_lanes_supported (vectype, group_size))
> +           {
> +             *memory_access_type = VMAT_LOAD_STORE_LANES;
> +             overrun_p = would_overrun_p;
> +           }
> +
> +         /* If that fails, try using permuting loads.  */
> +         if (*memory_access_type == VMAT_ELEMENTWISE
> +             && (vls_type == VLS_LOAD
> +                 ? vect_grouped_load_supported (vectype, single_element_p,
> +                                                group_size)
> +                 : vect_grouped_store_supported (vectype, group_size)))
> +           {
> +             *memory_access_type = VMAT_CONTIGUOUS_PERMUTE;
> +             overrun_p = would_overrun_p;
> +           }
> +       }
> +    }
> +
> +  if (vls_type != VLS_LOAD && first_stmt == stmt)
> +    {
> +      /* STMT is the leader of the group. Check the operands of all the
> +        stmts of the group.  */
> +      gimple *next_stmt = GROUP_NEXT_ELEMENT (stmt_info);
> +      while (next_stmt)
> +       {
> +         gcc_assert (gimple_assign_single_p (next_stmt));
> +         tree op = gimple_assign_rhs1 (next_stmt);
> +         gimple *def_stmt;
> +         enum vect_def_type dt;
> +         if (!vect_is_simple_use (op, vinfo, &def_stmt, &dt))
> +           {
> +             if (dump_enabled_p ())
> +               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                                "use not simple.\n");
> +             return false;
> +           }
> +         next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
> +       }
> +    }
> +
> +  if (overrun_p)
> +    {
> +      gcc_assert (can_overrun_p);
> +      if (dump_enabled_p ())
> +       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                        "Data access with gaps requires scalar "
> +                        "epilogue loop\n");
> +      LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
> +    }
> +
> +  return true;
> +}
> +
> +/* Analyze load or store statement STMT of type VLS_TYPE.  Return true
> +   if there is a memory access type that the vectorized form can use,
> +   storing it in *MEMORY_ACCESS_TYPE if so.  If we decide to use gathers
> +   or scatters, fill in GS_INFO accordingly.
> +
> +   SLP says whether we're performing SLP rather than loop vectorization.
> +   VECTYPE is the vector type that the vectorized statements will use.  */
> +
> +static bool
> +get_load_store_type (gimple *stmt, tree vectype, bool slp,
> +                    vec_load_store_type vls_type,
> +                    vect_memory_access_type *memory_access_type,
> +                    gather_scatter_info *gs_info)
> +{
> +  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
> +  vec_info *vinfo = stmt_info->vinfo;
> +  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
> +  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> +    {
> +      *memory_access_type = VMAT_GATHER_SCATTER;
> +      gimple *def_stmt;
> +      if (!vect_check_gather_scatter (stmt, loop_vinfo, gs_info))
> +       gcc_unreachable ();
> +      else if (!vect_is_simple_use (gs_info->offset, vinfo, &def_stmt,
> +                                   &gs_info->offset_dt,
> +                                   &gs_info->offset_vectype))
> +       {
> +         if (dump_enabled_p ())
> +           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                            "%s index use not simple.\n",
> +                            vls_type == VLS_LOAD ? "gather" : "scatter");
> +         return false;
> +       }
> +    }
> +  else if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
> +    {
> +      if (!get_group_load_store_type (stmt, vectype, slp, vls_type,
> +                                     memory_access_type))
> +       return false;
> +    }
> +  else if (STMT_VINFO_STRIDED_P (stmt_info))
> +    {
> +      gcc_assert (!slp);
> +      *memory_access_type = VMAT_ELEMENTWISE;
> +    }
> +  else
> +    *memory_access_type = VMAT_CONTIGUOUS;
> +
> +  /* FIXME: At the moment the cost model seems to underestimate the
> +     cost of using elementwise accesses.  This check preserves the
> +     traditional behavior until that can be fixed.  */
> +  if (*memory_access_type == VMAT_ELEMENTWISE
> +      && !STMT_VINFO_STRIDED_P (stmt_info))
> +    {
> +      if (dump_enabled_p ())
> +       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                        "not falling back to elementwise accesses\n");
> +      return false;
> +    }
> +  return true;
> +}
> +
>  /* Function vectorizable_mask_load_store.
>
>     Check if STMT performs a conditional load or store that can be vectorized.
> @@ -1705,7 +1907,7 @@ vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi,
>    int i, j;
>    bool inv_p;
>    gather_scatter_info gs_info;
> -  bool is_store;
> +  vec_load_store_type vls_type;
>    tree mask;
>    gimple *def_stmt;
>    enum vect_def_type dt;
> @@ -1716,7 +1918,6 @@ vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi,
>    ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
>    gcc_assert (ncopies >= 1);
>
> -  is_store = gimple_call_internal_fn (stmt) == IFN_MASK_STORE;
>    mask = gimple_call_arg (stmt, 2);
>
>    if (TREE_CODE (TREE_TYPE (mask)) != BOOLEAN_TYPE)
> @@ -1743,12 +1944,6 @@ vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi,
>
>    elem_type = TREE_TYPE (vectype);
>
> -  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
> -    return false;
> -
> -  if (STMT_VINFO_STRIDED_P (stmt_info))
> -    return false;
> -
>    if (TREE_CODE (mask) != SSA_NAME)
>      return false;
>
> @@ -1762,27 +1957,26 @@ vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi,
>        || TYPE_VECTOR_SUBPARTS (mask_vectype) != TYPE_VECTOR_SUBPARTS (vectype))
>      return false;
>
> -  if (is_store)
> +  if (gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
>      {
>        tree rhs = gimple_call_arg (stmt, 3);
>        if (!vect_is_simple_use (rhs, loop_vinfo, &def_stmt, &dt, &rhs_vectype))
>         return false;
> +      if (dt == vect_constant_def || dt == vect_external_def)
> +       vls_type = VLS_STORE_INVARIANT;
> +      else
> +       vls_type = VLS_STORE;
>      }
> +  else
> +    vls_type = VLS_LOAD;
>
> -  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> -    {
> -      gimple *def_stmt;
> -      if (!vect_check_gather_scatter (stmt, loop_vinfo, &gs_info))
> -       gcc_unreachable ();
> -      if (!vect_is_simple_use (gs_info.offset, loop_vinfo, &def_stmt,
> -                              &gs_info.offset_dt, &gs_info.offset_vectype))
> -       {
> -         if (dump_enabled_p ())
> -           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -                            "gather index use not simple.");
> -         return false;
> -       }
> +  vect_memory_access_type memory_access_type;
> +  if (!get_load_store_type (stmt, vectype, false, vls_type,
> +                           &memory_access_type, &gs_info))
> +    return false;
>
> +  if (memory_access_type == VMAT_GATHER_SCATTER)
> +    {
>        tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gs_info.decl));
>        tree masktype
>         = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (TREE_CHAIN (arglist))));
> @@ -1794,6 +1988,14 @@ vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi,
>           return false;
>         }
>      }
> +  else if (memory_access_type != VMAT_CONTIGUOUS)
> +    {
> +      if (dump_enabled_p ())
> +       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                        "unsupported access type for masked %s\n",
> +                        vls_type == VLS_LOAD ? "load" : "store");
> +      return false;
> +    }
>    else if (tree_int_cst_compare (nested_in_vect_loop
>                                  ? STMT_VINFO_DR_STEP (stmt_info)
>                                  : DR_STEP (dr), size_zero_node) <= 0)
> @@ -1801,25 +2003,28 @@ vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi,
>    else if (!VECTOR_MODE_P (TYPE_MODE (vectype))
>            || !can_vec_mask_load_store_p (TYPE_MODE (vectype),
>                                           TYPE_MODE (mask_vectype),
> -                                         !is_store)
> +                                         vls_type == VLS_LOAD)
>            || (rhs_vectype
>                && !useless_type_conversion_p (vectype, rhs_vectype)))
>      return false;
>
>    if (!vec_stmt) /* transformation not required.  */
>      {
> +      STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) = memory_access_type;
>        STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
> -      if (is_store)
> -       vect_model_store_cost (stmt_info, ncopies, false, dt,
> -                              NULL, NULL, NULL);
> +      if (vls_type == VLS_LOAD)
> +       vect_model_load_cost (stmt_info, ncopies, memory_access_type,
> +                             NULL, NULL, NULL);
>        else
> -       vect_model_load_cost (stmt_info, ncopies, false, NULL, NULL, NULL);
> +       vect_model_store_cost (stmt_info, ncopies, memory_access_type,
> +                              dt, NULL, NULL, NULL);
>        return true;
>      }
> +  gcc_assert (memory_access_type == STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info));
>
>    /** Transform.  **/
>
> -  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> +  if (memory_access_type == VMAT_GATHER_SCATTER)
>      {
>        tree vec_oprnd0 = NULL_TREE, op;
>        tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gs_info.decl));
> @@ -1993,7 +2198,7 @@ vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi,
>        gsi_replace (gsi, new_stmt, true);
>        return true;
>      }
> -  else if (is_store)
> +  else if (vls_type != VLS_LOAD)
>      {
>        tree vec_rhs = NULL_TREE, vec_mask = NULL_TREE;
>        prev_stmt_info = NULL;
> @@ -2102,7 +2307,7 @@ vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi,
>         }
>      }
>
> -  if (!is_store)
> +  if (vls_type == VLS_LOAD)
>      {
>        /* Ensure that even with -fno-tree-dce the scalar MASK_LOAD is removed
>          from the IL.  */
> @@ -5188,9 +5393,8 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
>    gimple *ptr_incr = NULL;
>    int ncopies;
>    int j;
> -  gimple *next_stmt, *first_stmt = NULL;
> -  bool grouped_store = false;
> -  bool store_lanes_p = false;
> +  gimple *next_stmt, *first_stmt;
> +  bool grouped_store;
>    unsigned int group_size, i;
>    vec<tree> dr_chain = vNULL;
>    vec<tree> oprnds = vNULL;
> @@ -5207,6 +5411,7 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
>    gather_scatter_info gs_info;
>    enum vect_def_type scatter_src_dt = vect_unknown_def_type;
>    gimple *new_stmt;
> +  vec_load_store_type vls_type;
>
>    if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
>      return false;
> @@ -5274,6 +5479,11 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
>        return false;
>      }
>
> +  if (dt == vect_constant_def || dt == vect_external_def)
> +    vls_type = VLS_STORE_INVARIANT;
> +  else
> +    vls_type = VLS_STORE;
> +
>    if (rhs_vectype && !useless_type_conversion_p (vectype, rhs_vectype))
>      return false;
>
> @@ -5303,7 +5513,6 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
>         }
>        if (negative)
>         {
> -         gcc_assert (!grouped_store);
>           alignment_support_scheme = vect_supportable_dr_alignment (dr, false);
>           if (alignment_support_scheme != dr_aligned
>               && alignment_support_scheme != dr_unaligned_supported)
> @@ -5325,80 +5534,31 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
>         }
>      }
>
> -  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
> -    {
> -      grouped_store = true;
> -      first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
> -      group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt));
> -      if (!slp && !STMT_VINFO_STRIDED_P (stmt_info))
> -       {
> -         if (vect_store_lanes_supported (vectype, group_size))
> -           store_lanes_p = true;
> -         else if (!vect_grouped_store_supported (vectype, group_size))
> -           return false;
> -       }
> -
> -      if (STMT_VINFO_STRIDED_P (stmt_info)
> -         && slp
> -         && (group_size > nunits
> -             || nunits % group_size != 0))
> -       {
> -         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -                          "unhandled strided group store\n");
> -         return false;
> -       }
> -
> -      if (first_stmt == stmt)
> -       {
> -          /* STMT is the leader of the group. Check the operands of all the
> -             stmts of the group.  */
> -          next_stmt = GROUP_NEXT_ELEMENT (stmt_info);
> -          while (next_stmt)
> -            {
> -             gcc_assert (gimple_assign_single_p (next_stmt));
> -             op = gimple_assign_rhs1 (next_stmt);
> -              if (!vect_is_simple_use (op, vinfo, &def_stmt, &dt))
> -                {
> -                  if (dump_enabled_p ())
> -                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -                                     "use not simple.\n");
> -                  return false;
> -                }
> -              next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
> -            }
> -        }
> -    }
> -
> -  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> -    {
> -      gimple *def_stmt;
> -      if (!vect_check_gather_scatter (stmt, loop_vinfo, &gs_info))
> -       gcc_unreachable ();
> -      if (!vect_is_simple_use (gs_info.offset, vinfo, &def_stmt,
> -                              &gs_info.offset_dt, &gs_info.offset_vectype))
> -       {
> -         if (dump_enabled_p ())
> -           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -                             "scatter index use not simple.");
> -         return false;
> -       }
> -    }
> +  vect_memory_access_type memory_access_type;
> +  if (!get_load_store_type (stmt, vectype, slp, vls_type,
> +                           &memory_access_type, &gs_info))
> +    return false;
>
>    if (!vec_stmt) /* transformation not required.  */
>      {
> +      if (!slp)
> +       STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) = memory_access_type;
>        STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
>        /* The SLP costs are calculated during SLP analysis.  */
>        if (!PURE_SLP_STMT (stmt_info))
> -       vect_model_store_cost (stmt_info, ncopies, store_lanes_p, dt,
> +       vect_model_store_cost (stmt_info, ncopies, memory_access_type, dt,
>                                NULL, NULL, NULL);
>        return true;
>      }
> +  if (!slp)
> +    gcc_assert (memory_access_type
> +               == STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info));
>
>    /** Transform.  **/
>
>    ensure_base_align (stmt_info, dr);
>
> -  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> +  if (memory_access_type == VMAT_GATHER_SCATTER)
>      {
>        tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE, op, src;
>        tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gs_info.decl));
> @@ -5538,8 +5698,10 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
>        return true;
>      }
>
> +  grouped_store = STMT_VINFO_GROUPED_ACCESS (stmt_info);
>    if (grouped_store)
>      {
> +      first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
>        first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
>        group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt));
>
> @@ -5585,7 +5747,8 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
>      dump_printf_loc (MSG_NOTE, vect_location,
>                       "transform store. ncopies = %d\n", ncopies);
>
> -  if (STMT_VINFO_STRIDED_P (stmt_info))
> +  if (memory_access_type == VMAT_ELEMENTWISE
> +      || memory_access_type == VMAT_STRIDED_SLP)
>      {
>        gimple_stmt_iterator incr_gsi;
>        bool insert_after;
> @@ -5756,14 +5919,14 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
>    gcc_assert (alignment_support_scheme);
>    /* Targets with store-lane instructions must not require explicit
>       realignment.  */
> -  gcc_assert (!store_lanes_p
> +  gcc_assert (memory_access_type != VMAT_LOAD_STORE_LANES
>               || alignment_support_scheme == dr_aligned
>               || alignment_support_scheme == dr_unaligned_supported);
>
>    if (negative)
>      offset = size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1);
>
> -  if (store_lanes_p)
> +  if (memory_access_type == VMAT_LOAD_STORE_LANES)
>      aggr_type = build_array_type_nelts (elem_type, vec_num * nunits);
>    else
>      aggr_type = vectype;
> @@ -5901,7 +6064,7 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
>                                            TYPE_SIZE_UNIT (aggr_type));
>         }
>
> -      if (store_lanes_p)
> +      if (memory_access_type == VMAT_LOAD_STORE_LANES)
>         {
>           tree vec_array;
>
> @@ -6185,7 +6348,6 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
>    gphi *phi = NULL;
>    vec<tree> dr_chain = vNULL;
>    bool grouped_load = false;
> -  bool load_lanes_p = false;
>    gimple *first_stmt;
>    gimple *first_stmt_for_drptr = NULL;
>    bool inv_p;
> @@ -6294,48 +6456,11 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
>      {
>        grouped_load = true;
>        /* FORNOW */
> -      gcc_assert (!nested_in_vect_loop && !STMT_VINFO_GATHER_SCATTER_P (stmt_info));
> +      gcc_assert (!nested_in_vect_loop);
> +      gcc_assert (!STMT_VINFO_GATHER_SCATTER_P (stmt_info));
>
>        first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
>        group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt));
> -      bool single_element_p = (first_stmt == stmt
> -                              && !GROUP_NEXT_ELEMENT (stmt_info));
> -
> -      if (!slp && !STMT_VINFO_STRIDED_P (stmt_info))
> -       {
> -         if (vect_load_lanes_supported (vectype, group_size))
> -           load_lanes_p = true;
> -         else if (!vect_grouped_load_supported (vectype, single_element_p,
> -                                                group_size))
> -           return false;
> -       }
> -
> -      if (single_element_p)
> -       {
> -         /* Single-element interleaving requires peeling for gaps.  */
> -         gcc_assert (GROUP_GAP (stmt_info));
> -       }
> -
> -      /* If there is a gap in the end of the group then we access excess
> -        elements in the last iteration and thus need to peel that off.  */
> -      if (loop_vinfo
> -         && ! STMT_VINFO_STRIDED_P (stmt_info)
> -         && GROUP_GAP (vinfo_for_stmt (first_stmt)) != 0)
> -       {
> -         if (dump_enabled_p ())
> -           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -                            "Data access with gaps requires scalar "
> -                            "epilogue loop\n");
> -         if (loop->inner)
> -           {
> -             if (dump_enabled_p ())
> -               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -                                "Peeling for outer loop is not supported\n");
> -             return false;
> -           }
> -
> -         LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
> -       }
>
>        if (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
>         slp_perm = true;
> @@ -6381,24 +6506,13 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
>         }
>      }
>
> +  vect_memory_access_type memory_access_type;
> +  if (!get_load_store_type (stmt, vectype, slp, VLS_LOAD,
> +                           &memory_access_type, &gs_info))
> +    return false;
>
> -  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> -    {
> -      gimple *def_stmt;
> -      if (!vect_check_gather_scatter (stmt, loop_vinfo, &gs_info))
> -       gcc_unreachable ();
> -      if (!vect_is_simple_use (gs_info.offset, vinfo, &def_stmt,
> -                              &gs_info.offset_dt, &gs_info.offset_vectype))
> -       {
> -         if (dump_enabled_p ())
> -           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -                             "gather index use not simple.\n");
> -         return false;
> -       }
> -    }
> -  else if (STMT_VINFO_STRIDED_P (stmt_info))
> -    ;
> -  else
> +  if (!STMT_VINFO_GATHER_SCATTER_P (stmt_info)
> +      && !STMT_VINFO_STRIDED_P (stmt_info))
>      {
>        negative = tree_int_cst_compare (nested_in_vect_loop
>                                        ? STMT_VINFO_DR_STEP (stmt_info)
> @@ -6444,14 +6558,20 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
>
>    if (!vec_stmt) /* transformation not required.  */
>      {
> +      if (!slp)
> +       STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) = memory_access_type;
>        STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
>        /* The SLP costs are calculated during SLP analysis.  */
>        if (!PURE_SLP_STMT (stmt_info))
> -       vect_model_load_cost (stmt_info, ncopies, load_lanes_p,
> +       vect_model_load_cost (stmt_info, ncopies, memory_access_type,
>                               NULL, NULL, NULL);
>        return true;
>      }
>
> +  if (!slp)
> +    gcc_assert (memory_access_type
> +               == STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info));
> +
>    if (dump_enabled_p ())
>      dump_printf_loc (MSG_NOTE, vect_location,
>                       "transform load. ncopies = %d\n", ncopies);
> @@ -6460,7 +6580,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
>
>    ensure_base_align (stmt_info, dr);
>
> -  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> +  if (memory_access_type == VMAT_GATHER_SCATTER)
>      {
>        tree vec_oprnd0 = NULL_TREE, op;
>        tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gs_info.decl));
> @@ -6627,7 +6747,9 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
>         }
>        return true;
>      }
> -  else if (STMT_VINFO_STRIDED_P (stmt_info))
> +
> +  if (memory_access_type == VMAT_ELEMENTWISE
> +      || memory_access_type == VMAT_STRIDED_SLP)
>      {
>        gimple_stmt_iterator incr_gsi;
>        bool insert_after;
> @@ -6694,26 +6816,23 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
>        int lnel = 1;
>        tree ltype = TREE_TYPE (vectype);
>        auto_vec<tree> dr_chain;
> -      if (slp)
> +      if (memory_access_type == VMAT_STRIDED_SLP)
>         {
> -         if (group_size < nunits
> -             && nunits % group_size == 0)
> +         nloads = nunits / group_size;
> +         if (group_size < nunits)
>             {
> -             nloads = nunits / group_size;
>               lnel = group_size;
>               ltype = build_vector_type (TREE_TYPE (vectype), group_size);
> -             ltype = build_aligned_type (ltype,
> -                                         TYPE_ALIGN (TREE_TYPE (vectype)));
>             }
> -         else if (group_size >= nunits
> -                  && group_size % nunits == 0)
> +         else
>             {
> -             nloads = 1;
>               lnel = nunits;
>               ltype = vectype;
> -             ltype = build_aligned_type (ltype,
> -                                         TYPE_ALIGN (TREE_TYPE (vectype)));
>             }
> +         ltype = build_aligned_type (ltype, TYPE_ALIGN (TREE_TYPE (vectype)));
> +       }
> +      if (slp)
> +       {
>           /* For SLP permutation support we need to load the whole group,
>              not only the number of vector stmts the permutation result
>              fits in.  */
> @@ -6845,7 +6964,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
>    gcc_assert (alignment_support_scheme);
>    /* Targets with load-lane instructions must not require explicit
>       realignment.  */
> -  gcc_assert (!load_lanes_p
> +  gcc_assert (memory_access_type != VMAT_LOAD_STORE_LANES
>               || alignment_support_scheme == dr_aligned
>               || alignment_support_scheme == dr_unaligned_supported);
>
> @@ -6980,7 +7099,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
>    if (negative)
>      offset = size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1);
>
> -  if (load_lanes_p)
> +  if (memory_access_type == VMAT_LOAD_STORE_LANES)
>      aggr_type = build_array_type_nelts (elem_type, vec_num * nunits);
>    else
>      aggr_type = vectype;
> @@ -7043,7 +7162,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
>        if (grouped_load || slp_perm)
>         dr_chain.create (vec_num);
>
> -      if (load_lanes_p)
> +      if (memory_access_type == VMAT_LOAD_STORE_LANES)
>         {
>           tree vec_array;
>
> @@ -7313,7 +7432,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
>          {
>            if (grouped_load)
>             {
> -             if (!load_lanes_p)
> +             if (memory_access_type != VMAT_LOAD_STORE_LANES)
>                 vect_transform_grouped_load (stmt, dr_chain, group_size, gsi);
>               *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
>             }
Richard Sandiford July 3, 2016, 5:10 p.m. UTC | #3
Richard Biener <richard.guenther@gmail.com> writes:
> On Wed, Jun 15, 2016 at 10:52 AM, Richard Sandiford
> <richard.sandiford@arm.com> wrote:
>> This is the main patch in the series.  It adds a new enum and routines
>> for classifying a vector load or store implementation.
>>
>> Tested on aarch64-linux-gnu and x86_64-linux-gnu.  OK to install?
>
> Why's the setting and checking of the memory access type conditional on !slp?
> I'd rather avoid doing this :/

For loads we need it for hybrid SLP, since we can vectorise the
same load twice, once for SLP and once not.  (See e.g. pr62075.c.)
For stores it was unnecessary cut-&-paste.
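
(Illustrative only, not the actual pr62075.c: in a loop of roughly the shape below, the loads from b feed both a grouped store that forms an SLP instance and a reduction that is vectorized outside SLP, so the same load statement is vectorized twice.)

int
hybrid (int *restrict a, int *restrict b, int n)
{
  int sum = 0;
  for (int i = 0; i < n; i++)
    {
      a[2 * i] = b[2 * i] + 1;          /* SLP instance rooted at the
                                           grouped store */
      a[2 * i + 1] = b[2 * i + 1] + 2;
      sum += b[2 * i];                  /* non-SLP reduction reusing the
                                           same load */
    }
  return sum;
}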

Is it OK with the !slp restricted to vectorizable_load?

Thanks,
Richard
Richard Biener July 4, 2016, 9:50 a.m. UTC | #4
On Sun, Jul 3, 2016 at 7:10 PM, Richard Sandiford
<rdsandiford@googlemail.com> wrote:
> Richard Biener <richard.guenther@gmail.com> writes:
>> On Wed, Jun 15, 2016 at 10:52 AM, Richard Sandiford
>> <richard.sandiford@arm.com> wrote:
>>> This is the main patch in the series.  It adds a new enum and routines
>>> for classifying a vector load or store implementation.
>>>
>>> Tested on aarch64-linux-gnu and x86_64-linux-gnu.  OK to install?
>>
>> Why's the setting and checking of the memory access type conditional on !slp?
>> I'd rather avoid doing this :/
>
> For loads we need it for hybrid SLP, since we can vectorise the
> same load twice, once for SLP and once not.  (See e.g. pr62075.c.)

Ah, indeed.

> For stores it was unnecessary cut-&-paste.
>
> Is it OK with the !slp restricted to vectorizable_load?

Yes.

Thanks,
Richard.

> Thanks,
> Richard
>

Patch

Index: gcc/tree-vectorizer.h
===================================================================
--- gcc/tree-vectorizer.h
+++ gcc/tree-vectorizer.h
@@ -485,6 +485,33 @@  enum slp_vect_type {
   hybrid
 };
 
+/* Describes how we're going to vectorize an individual load or store,
+   or a group of loads or stores.  */
+enum vect_memory_access_type {
+  /* A simple contiguous access.  */
+  VMAT_CONTIGUOUS,
+
+  /* A simple contiguous access in which the elements need to be permuted
+     after loading or before storing.  Only used for loop vectorization;
+     SLP uses separate permutes.  */
+  VMAT_CONTIGUOUS_PERMUTE,
+
+  /* An access that uses IFN_LOAD_LANES or IFN_STORE_LANES.  */
+  VMAT_LOAD_STORE_LANES,
+
+  /* An access in which each scalar element is loaded or stored
+     individually.  */
+  VMAT_ELEMENTWISE,
+
+  /* A hybrid of VMAT_CONTIGUOUS and VMAT_ELEMENTWISE, used for grouped
+     SLP accesses.  Each unrolled iteration uses a contiguous load
+     or store for the whole group, but the groups from separate iterations
+     are combined in the same way as for VMAT_ELEMENTWISE.  */
+  VMAT_STRIDED_SLP,
+
+  /* The access uses gather loads or scatter stores.  */
+  VMAT_GATHER_SCATTER
+};
 
 typedef struct data_reference *dr_p;
 
@@ -602,6 +629,10 @@  typedef struct _stmt_vec_info {
   /* True if this is an access with loop-invariant stride.  */
   bool strided_p;
 
+  /* Classifies how the load or store is going to be implemented
+     for loop vectorization.  */
+  vect_memory_access_type memory_access_type;
+
   /* For both loads and stores.  */
   bool simd_lane_access_p;
 
@@ -659,6 +690,7 @@  STMT_VINFO_BB_VINFO (stmt_vec_info stmt_vinfo)
 #define STMT_VINFO_DATA_REF(S)             (S)->data_ref_info
 #define STMT_VINFO_GATHER_SCATTER_P(S)	   (S)->gather_scatter_p
 #define STMT_VINFO_STRIDED_P(S)	   	   (S)->strided_p
+#define STMT_VINFO_MEMORY_ACCESS_TYPE(S)   (S)->memory_access_type
 #define STMT_VINFO_SIMD_LANE_ACCESS_P(S)   (S)->simd_lane_access_p
 #define STMT_VINFO_VEC_REDUCTION_TYPE(S)   (S)->v_reduc_type
 
@@ -1006,12 +1038,12 @@  extern void free_stmt_vec_info (gimple *stmt);
 extern void vect_model_simple_cost (stmt_vec_info, int, enum vect_def_type *,
                                     stmt_vector_for_cost *,
 				    stmt_vector_for_cost *);
-extern void vect_model_store_cost (stmt_vec_info, int, bool,
+extern void vect_model_store_cost (stmt_vec_info, int, vect_memory_access_type,
 				   enum vect_def_type, slp_tree,
 				   stmt_vector_for_cost *,
 				   stmt_vector_for_cost *);
-extern void vect_model_load_cost (stmt_vec_info, int, bool, slp_tree,
-				  stmt_vector_for_cost *,
+extern void vect_model_load_cost (stmt_vec_info, int, vect_memory_access_type,
+				  slp_tree, stmt_vector_for_cost *,
 				  stmt_vector_for_cost *);
 extern unsigned record_stmt_cost (stmt_vector_for_cost *, int,
 				  enum vect_cost_for_stmt, stmt_vec_info,
Index: gcc/tree-vect-slp.c
===================================================================
--- gcc/tree-vect-slp.c
+++ gcc/tree-vect-slp.c
@@ -1490,9 +1490,13 @@  vect_analyze_slp_cost_1 (slp_instance instance, slp_tree node,
   stmt_info = vinfo_for_stmt (stmt);
   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
     {
+      vect_memory_access_type memory_access_type
+	= (STMT_VINFO_STRIDED_P (stmt_info)
+	   ? VMAT_STRIDED_SLP
+	   : VMAT_CONTIGUOUS);
       if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
-	vect_model_store_cost (stmt_info, ncopies_for_cost, false,
-			       vect_uninitialized_def,
+	vect_model_store_cost (stmt_info, ncopies_for_cost,
+			       memory_access_type, vect_uninitialized_def,
 			       node, prologue_cost_vec, body_cost_vec);
       else
 	{
@@ -1515,8 +1519,9 @@  vect_analyze_slp_cost_1 (slp_instance instance, slp_tree node,
 	      ncopies_for_cost *= SLP_INSTANCE_UNROLLING_FACTOR (instance);
 	    }
 	  /* Record the cost for the vector loads.  */
-	  vect_model_load_cost (stmt_info, ncopies_for_cost, false,
-				node, prologue_cost_vec, body_cost_vec);
+	  vect_model_load_cost (stmt_info, ncopies_for_cost,
+				memory_access_type, node, prologue_cost_vec,
+				body_cost_vec);
 	  return;
 	}
     }
Index: gcc/tree-vect-stmts.c
===================================================================
--- gcc/tree-vect-stmts.c
+++ gcc/tree-vect-stmts.c
@@ -52,6 +52,14 @@  along with GCC; see the file COPYING3.  If not see
 /* For lang_hooks.types.type_for_mode.  */
 #include "langhooks.h"
 
+/* Says whether a statement is a load, a store of a vectorized statement
+   result, or a store of an invariant value.  */
+enum vec_load_store_type {
+  VLS_LOAD,
+  VLS_STORE,
+  VLS_STORE_INVARIANT
+};
+
 /* Return the vectorized type for the given statement.  */
 
 tree
@@ -873,8 +881,8 @@  vect_model_promotion_demotion_cost (stmt_vec_info stmt_info,
 
 void
 vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
-		       bool store_lanes_p, enum vect_def_type dt,
-		       slp_tree slp_node,
+		       vect_memory_access_type memory_access_type,
+		       enum vect_def_type dt, slp_tree slp_node,
 		       stmt_vector_for_cost *prologue_cost_vec,
 		       stmt_vector_for_cost *body_cost_vec)
 {
@@ -903,14 +911,9 @@  vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
   /* We assume that the cost of a single store-lanes instruction is
      equivalent to the cost of GROUP_SIZE separate stores.  If a grouped
      access is instead being provided by a permute-and-store operation,
-     include the cost of the permutes.
-
-     For SLP, the caller has already counted the permutation, if any.  */
-  if (grouped_access_p
-      && first_stmt_p
-      && !store_lanes_p
-      && !STMT_VINFO_STRIDED_P (stmt_info)
-      && !slp_node)
+     include the cost of the permutes.  */
+  if (first_stmt_p
+      && memory_access_type == VMAT_CONTIGUOUS_PERMUTE)
     {
       /* Uses a high and low interleave or shuffle operations for each
 	 needed permute.  */
@@ -927,17 +930,16 @@  vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
 
   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
   /* Costs of the stores.  */
-  if (STMT_VINFO_STRIDED_P (stmt_info) && !(slp_node && grouped_access_p))
-    {
-      /* N scalar stores plus extracting the elements.  */
-      inside_cost += record_stmt_cost (body_cost_vec,
-				       ncopies * TYPE_VECTOR_SUBPARTS (vectype),
-				       scalar_store, stmt_info, 0, vect_body);
-    }
+  if (memory_access_type == VMAT_ELEMENTWISE)
+    /* N scalar stores plus extracting the elements.  */
+    inside_cost += record_stmt_cost (body_cost_vec,
+				     ncopies * TYPE_VECTOR_SUBPARTS (vectype),
+				     scalar_store, stmt_info, 0, vect_body);
   else
     vect_get_store_cost (dr, ncopies, &inside_cost, body_cost_vec);
 
-  if (STMT_VINFO_STRIDED_P (stmt_info))
+  if (memory_access_type == VMAT_ELEMENTWISE
+      || memory_access_type == VMAT_STRIDED_SLP)
     inside_cost += record_stmt_cost (body_cost_vec,
 				     ncopies * TYPE_VECTOR_SUBPARTS (vectype),
 				     vec_to_scalar, stmt_info, 0, vect_body);
@@ -1011,7 +1013,8 @@  vect_get_store_cost (struct data_reference *dr, int ncopies,
 
 void
 vect_model_load_cost (stmt_vec_info stmt_info, int ncopies,
-		      bool load_lanes_p, slp_tree slp_node,
+		      vect_memory_access_type memory_access_type,
+		      slp_tree slp_node,
 		      stmt_vector_for_cost *prologue_cost_vec,
 		      stmt_vector_for_cost *body_cost_vec)
 {
@@ -1036,14 +1039,9 @@  vect_model_load_cost (stmt_vec_info stmt_info, int ncopies,
   /* We assume that the cost of a single load-lanes instruction is
      equivalent to the cost of GROUP_SIZE separate loads.  If a grouped
      access is instead being provided by a load-and-permute operation,
-     include the cost of the permutes.
-
-     For SLP, the caller has already counted the permutation, if any.  */
-  if (grouped_access_p
-      && first_stmt_p
-      && !load_lanes_p
-      && !STMT_VINFO_STRIDED_P (stmt_info)
-      && !slp_node)
+     include the cost of the permutes.  */
+  if (first_stmt_p
+      && memory_access_type == VMAT_CONTIGUOUS_PERMUTE)
     {
       /* Uses an even and odd extract operations or shuffle operations
 	 for each needed permute.  */
@@ -1059,7 +1057,7 @@  vect_model_load_cost (stmt_vec_info stmt_info, int ncopies,
     }
 
   /* The loads themselves.  */
-  if (STMT_VINFO_STRIDED_P (stmt_info) && !(slp_node && grouped_access_p))
+  if (memory_access_type == VMAT_ELEMENTWISE)
     {
       /* N scalar loads plus gathering them into a vector.  */
       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
@@ -1071,7 +1069,8 @@  vect_model_load_cost (stmt_vec_info stmt_info, int ncopies,
     vect_get_load_cost (dr, ncopies, first_stmt_p,
 			&inside_cost, &prologue_cost, 
 			prologue_cost_vec, body_cost_vec, true);
-  if (STMT_VINFO_STRIDED_P (stmt_info))
+  if (memory_access_type == VMAT_ELEMENTWISE
+      || memory_access_type == VMAT_STRIDED_SLP)
     inside_cost += record_stmt_cost (body_cost_vec, ncopies, vec_construct,
 				     stmt_info, 0, vect_body);
 
@@ -1674,6 +1673,209 @@  static tree permute_vec_elements (tree, tree, tree, gimple *,
 				  gimple_stmt_iterator *);
 
 
+/* A subroutine of get_load_store_type, with a subset of the same
+   arguments.  Handle the case where STMT is part of a grouped load
+   or store.
+
+   For stores, the statements in the group are all consecutive
+   and there is no gap at the end.  For loads, the statements in the
+   group might not be consecutive; there can be gaps between statements
+   as well as at the end.  */
+
+static bool
+get_group_load_store_type (gimple *stmt, tree vectype, bool slp,
+			   vec_load_store_type vls_type,
+			   vect_memory_access_type *memory_access_type)
+{
+  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+  vec_info *vinfo = stmt_info->vinfo;
+  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
+  struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
+  gimple *first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
+  unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt));
+  bool single_element_p = (stmt == first_stmt
+			   && !GROUP_NEXT_ELEMENT (stmt_info));
+  unsigned HOST_WIDE_INT gap = GROUP_GAP (vinfo_for_stmt (first_stmt));
+  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
+
+  /* True if the vectorized statements would access beyond the last
+     statement in the group.  */
+  bool overrun_p = false;
+
+  /* True if we can cope with such overrun by peeling for gaps, so that
+     there is at least one final scalar iteration after the vector loop.  */
+  bool can_overrun_p = (vls_type == VLS_LOAD && loop_vinfo && !loop->inner);
+
+  /* There can only be a gap at the end of the group if the stride is
+     known at compile time.  */
+  gcc_assert (!STMT_VINFO_STRIDED_P (stmt_info) || gap == 0);
+
+  /* Stores can't yet have gaps.  */
+  gcc_assert (slp || vls_type == VLS_LOAD || gap == 0);
+
+  if (slp)
+    {
+      if (STMT_VINFO_STRIDED_P (stmt_info))
+	{
+	  /* Try to use consecutive accesses of GROUP_SIZE elements,
+	     separated by the stride, until we have a complete vector.
+	     Fall back to scalar accesses if that isn't possible.  */
+	  if (nunits % group_size == 0)
+	    *memory_access_type = VMAT_STRIDED_SLP;
+	  else
+	    *memory_access_type = VMAT_ELEMENTWISE;
+	}
+      else
+	{
+	  overrun_p = loop_vinfo && gap != 0;
+	  if (overrun_p && vls_type != VLS_LOAD)
+	    {
+	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			       "Grouped store with gaps requires"
+			       " non-consecutive accesses\n");
+	      return false;
+	    }
+	  if (overrun_p && !can_overrun_p)
+	    {
+	      if (dump_enabled_p ())
+		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+				 "Peeling for outer loop is not supported\n");
+	      return false;
+	    }
+	  *memory_access_type = VMAT_CONTIGUOUS;
+	}
+    }
+  else
+    {
+      /* We can always handle this case using elementwise accesses,
+	 but see if something more efficient is available.  */
+      *memory_access_type = VMAT_ELEMENTWISE;
+
+      /* If there is a gap at the end of the group then these optimizations
+	 would access excess elements in the last iteration.  */
+      bool would_overrun_p = (gap != 0);
+      if (!STMT_VINFO_STRIDED_P (stmt_info)
+	  && (can_overrun_p || !would_overrun_p))
+	{
+	  /* First try using LOAD/STORE_LANES.  */
+	  if (vls_type == VLS_LOAD
+	      ? vect_load_lanes_supported (vectype, group_size)
+	      : vect_store_lanes_supported (vectype, group_size))
+	    {
+	      *memory_access_type = VMAT_LOAD_STORE_LANES;
+	      overrun_p = would_overrun_p;
+	    }
+
+	  /* If that fails, try using permuting loads.  */
+	  if (*memory_access_type == VMAT_ELEMENTWISE
+	      && (vls_type == VLS_LOAD
+		  ? vect_grouped_load_supported (vectype, single_element_p,
+						 group_size)
+		  : vect_grouped_store_supported (vectype, group_size)))
+	    {
+	      *memory_access_type = VMAT_CONTIGUOUS_PERMUTE;
+	      overrun_p = would_overrun_p;
+	    }
+	}
+    }
+
+  if (vls_type != VLS_LOAD && first_stmt == stmt)
+    {
+      /* STMT is the leader of the group. Check the operands of all the
+	 stmts of the group.  */
+      gimple *next_stmt = GROUP_NEXT_ELEMENT (stmt_info);
+      while (next_stmt)
+	{
+	  gcc_assert (gimple_assign_single_p (next_stmt));
+	  tree op = gimple_assign_rhs1 (next_stmt);
+	  gimple *def_stmt;
+	  enum vect_def_type dt;
+	  if (!vect_is_simple_use (op, vinfo, &def_stmt, &dt))
+	    {
+	      if (dump_enabled_p ())
+		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+				 "use not simple.\n");
+	      return false;
+	    }
+	  next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
+	}
+    }
+
+  if (overrun_p)
+    {
+      gcc_assert (can_overrun_p);
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			 "Data access with gaps requires scalar "
+			 "epilogue loop\n");
+      LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
+    }
+
+  return true;
+}
+
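
[Aside, not part of the patch: the strided-SLP branch above boils down to whether GROUP_SIZE tiles the vector exactly.  A minimal standalone sketch, with a mock enum standing in for vect_memory_access_type:]

/* Sketch only: the nunits % group_size test used for strided SLP groups
   in get_group_load_store_type above.  */

enum sketch_vmat { SKETCH_VMAT_ELEMENTWISE, SKETCH_VMAT_STRIDED_SLP };

static enum sketch_vmat
sketch_classify_strided_slp (int nunits, int group_size)
{
  /* GROUP_SIZE consecutive elements per access, repeated every stride:
     usable as vector building blocks only if they tile the vector.  */
  return (nunits % group_size == 0
	  ? SKETCH_VMAT_STRIDED_SLP : SKETCH_VMAT_ELEMENTWISE);
}

/* E.g. with a 4-element vector type:
   sketch_classify_strided_slp (4, 2) == SKETCH_VMAT_STRIDED_SLP
   sketch_classify_strided_slp (4, 3) == SKETCH_VMAT_ELEMENTWISE  */
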
+/* Analyze load or store statement STMT of type VLS_TYPE.  Return true
+   if there is a memory access type that the vectorized form can use,
+   storing it in *MEMORY_ACCESS_TYPE if so.  If we decide to use gathers
+   or scatters, fill in GS_INFO accordingly.
+
+   SLP says whether we're performing SLP rather than loop vectorization.
+   VECTYPE is the vector type that the vectorized statements will use.  */
+
+static bool
+get_load_store_type (gimple *stmt, tree vectype, bool slp,
+		     vec_load_store_type vls_type,
+		     vect_memory_access_type *memory_access_type,
+		     gather_scatter_info *gs_info)
+{
+  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+  vec_info *vinfo = stmt_info->vinfo;
+  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
+  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+    {
+      *memory_access_type = VMAT_GATHER_SCATTER;
+      gimple *def_stmt;
+      if (!vect_check_gather_scatter (stmt, loop_vinfo, gs_info))
+	gcc_unreachable ();
+      else if (!vect_is_simple_use (gs_info->offset, vinfo, &def_stmt,
+				    &gs_info->offset_dt,
+				    &gs_info->offset_vectype))
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "%s index use not simple.\n",
+			     vls_type == VLS_LOAD ? "gather" : "scatter");
+	  return false;
+	}
+    }
+  else if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
+    {
+      if (!get_group_load_store_type (stmt, vectype, slp, vls_type,
+				      memory_access_type))
+	return false;
+    }
+  else if (STMT_VINFO_STRIDED_P (stmt_info))
+    {
+      gcc_assert (!slp);
+      *memory_access_type = VMAT_ELEMENTWISE;
+    }
+  else
+    *memory_access_type = VMAT_CONTIGUOUS;
+
+  /* FIXME: At the moment the cost model seems to underestimate the
+     cost of using elementwise accesses.  This check preserves the
+     traditional behavior until that can be fixed.  */
+  if (*memory_access_type == VMAT_ELEMENTWISE
+      && !STMT_VINFO_STRIDED_P (stmt_info))
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			 "not falling back to elementwise accesses\n");
+      return false;
+    }
+  return true;
+}
+
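
[Aside, not part of the patch: the callers below all follow the same handshake on the non-SLP paths, namely analysis calls get_load_store_type and records the result in STMT_VINFO_MEMORY_ACCESS_TYPE, then the transform phase recomputes it and asserts it is unchanged.  A minimal self-contained sketch of that pattern, with mock types rather than the real stmt_vec_info:]

#include <assert.h>

/* Mock-ups standing in for vect_memory_access_type and the new
   memory_access_type field on the statement info.  */
enum sketch_vmat { SKETCH_VMAT_CONTIGUOUS, SKETCH_VMAT_GATHER_SCATTER };

struct sketch_stmt_info
{
  enum sketch_vmat memory_access_type;
};

static enum sketch_vmat
sketch_get_load_store_type (int gather_scatter_p)
{
  return (gather_scatter_p
	  ? SKETCH_VMAT_GATHER_SCATTER : SKETCH_VMAT_CONTIGUOUS);
}

static void
sketch_analyze (struct sketch_stmt_info *info, int gather_scatter_p)
{
  /* Analysis: classify once and record the decision.  */
  info->memory_access_type = sketch_get_load_store_type (gather_scatter_p);
}

static void
sketch_transform (struct sketch_stmt_info *info, int gather_scatter_p)
{
  /* Transform: reclassify and check that nothing changed in between.  */
  enum sketch_vmat vmat = sketch_get_load_store_type (gather_scatter_p);
  assert (vmat == info->memory_access_type);
  /* ... code generation would dispatch on VMAT here ...  */
  (void) vmat;
}
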
 /* Function vectorizable_mask_load_store.
 
    Check if STMT performs a conditional load or store that can be vectorized.
@@ -1705,7 +1907,7 @@  vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi,
   int i, j;
   bool inv_p;
   gather_scatter_info gs_info;
-  bool is_store;
+  vec_load_store_type vls_type;
   tree mask;
   gimple *def_stmt;
   enum vect_def_type dt;
@@ -1716,7 +1918,6 @@  vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi,
   ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
   gcc_assert (ncopies >= 1);
 
-  is_store = gimple_call_internal_fn (stmt) == IFN_MASK_STORE;
   mask = gimple_call_arg (stmt, 2);
 
   if (TREE_CODE (TREE_TYPE (mask)) != BOOLEAN_TYPE)
@@ -1743,12 +1944,6 @@  vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi,
 
   elem_type = TREE_TYPE (vectype);
 
-  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
-    return false;
-
-  if (STMT_VINFO_STRIDED_P (stmt_info))
-    return false;
-
   if (TREE_CODE (mask) != SSA_NAME)
     return false;
 
@@ -1762,27 +1957,26 @@  vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi,
       || TYPE_VECTOR_SUBPARTS (mask_vectype) != TYPE_VECTOR_SUBPARTS (vectype))
     return false;
 
-  if (is_store)
+  if (gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
     {
       tree rhs = gimple_call_arg (stmt, 3);
       if (!vect_is_simple_use (rhs, loop_vinfo, &def_stmt, &dt, &rhs_vectype))
 	return false;
+      if (dt == vect_constant_def || dt == vect_external_def)
+	vls_type = VLS_STORE_INVARIANT;
+      else
+	vls_type = VLS_STORE;
     }
+  else
+    vls_type = VLS_LOAD;
 
-  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
-    {
-      gimple *def_stmt;
-      if (!vect_check_gather_scatter (stmt, loop_vinfo, &gs_info))
-	gcc_unreachable ();
-      if (!vect_is_simple_use (gs_info.offset, loop_vinfo, &def_stmt,
-			       &gs_info.offset_dt, &gs_info.offset_vectype))
-	{
-	  if (dump_enabled_p ())
-	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-			     "gather index use not simple.");
-	  return false;
-	}
+  vect_memory_access_type memory_access_type;
+  if (!get_load_store_type (stmt, vectype, false, vls_type,
+			    &memory_access_type, &gs_info))
+    return false;
 
+  if (memory_access_type == VMAT_GATHER_SCATTER)
+    {
       tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gs_info.decl));
       tree masktype
 	= TREE_VALUE (TREE_CHAIN (TREE_CHAIN (TREE_CHAIN (arglist))));
@@ -1794,6 +1988,14 @@  vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi,
 	  return false;
 	}
     }
+  else if (memory_access_type != VMAT_CONTIGUOUS)
+    {
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			 "unsupported access type for masked %s\n",
+			 vls_type == VLS_LOAD ? "load" : "store");
+      return false;
+    }
   else if (tree_int_cst_compare (nested_in_vect_loop
 				 ? STMT_VINFO_DR_STEP (stmt_info)
 				 : DR_STEP (dr), size_zero_node) <= 0)
@@ -1801,25 +2003,28 @@  vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi,
   else if (!VECTOR_MODE_P (TYPE_MODE (vectype))
 	   || !can_vec_mask_load_store_p (TYPE_MODE (vectype),
 					  TYPE_MODE (mask_vectype),
-					  !is_store)
+					  vls_type == VLS_LOAD)
 	   || (rhs_vectype
 	       && !useless_type_conversion_p (vectype, rhs_vectype)))
     return false;
 
   if (!vec_stmt) /* transformation not required.  */
     {
+      STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) = memory_access_type;
       STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
-      if (is_store)
-	vect_model_store_cost (stmt_info, ncopies, false, dt,
-			       NULL, NULL, NULL);
+      if (vls_type == VLS_LOAD)
+	vect_model_load_cost (stmt_info, ncopies, memory_access_type,
+			      NULL, NULL, NULL);
       else
-	vect_model_load_cost (stmt_info, ncopies, false, NULL, NULL, NULL);
+	vect_model_store_cost (stmt_info, ncopies, memory_access_type,
+			       dt, NULL, NULL, NULL);
       return true;
     }
+  gcc_assert (memory_access_type == STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info));
 
   /** Transform.  **/
 
-  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+  if (memory_access_type == VMAT_GATHER_SCATTER)
     {
       tree vec_oprnd0 = NULL_TREE, op;
       tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gs_info.decl));
@@ -1993,7 +2198,7 @@  vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi,
       gsi_replace (gsi, new_stmt, true);
       return true;
     }
-  else if (is_store)
+  else if (vls_type != VLS_LOAD)
     {
       tree vec_rhs = NULL_TREE, vec_mask = NULL_TREE;
       prev_stmt_info = NULL;
@@ -2102,7 +2307,7 @@  vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi,
 	}
     }
 
-  if (!is_store)
+  if (vls_type == VLS_LOAD)
     {
       /* Ensure that even with -fno-tree-dce the scalar MASK_LOAD is removed
 	 from the IL.  */
@@ -5188,9 +5393,8 @@  vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
   gimple *ptr_incr = NULL;
   int ncopies;
   int j;
-  gimple *next_stmt, *first_stmt = NULL;
-  bool grouped_store = false;
-  bool store_lanes_p = false;
+  gimple *next_stmt, *first_stmt;
+  bool grouped_store;
   unsigned int group_size, i;
   vec<tree> dr_chain = vNULL;
   vec<tree> oprnds = vNULL;
@@ -5207,6 +5411,7 @@  vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
   gather_scatter_info gs_info;
   enum vect_def_type scatter_src_dt = vect_unknown_def_type;
   gimple *new_stmt;
+  vec_load_store_type vls_type;
 
   if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
     return false;
@@ -5274,6 +5479,11 @@  vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
       return false;
     }
 
+  if (dt == vect_constant_def || dt == vect_external_def)
+    vls_type = VLS_STORE_INVARIANT;
+  else
+    vls_type = VLS_STORE;
+
   if (rhs_vectype && !useless_type_conversion_p (vectype, rhs_vectype))
     return false;
 
@@ -5303,7 +5513,6 @@  vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
 	}
       if (negative)
 	{
-	  gcc_assert (!grouped_store);
 	  alignment_support_scheme = vect_supportable_dr_alignment (dr, false);
 	  if (alignment_support_scheme != dr_aligned
 	      && alignment_support_scheme != dr_unaligned_supported)
@@ -5325,80 +5534,31 @@  vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
 	}
     }
 
-  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
-    {
-      grouped_store = true;
-      first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
-      group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt));
-      if (!slp && !STMT_VINFO_STRIDED_P (stmt_info))
-	{
-	  if (vect_store_lanes_supported (vectype, group_size))
-	    store_lanes_p = true;
-	  else if (!vect_grouped_store_supported (vectype, group_size))
-	    return false;
-	}
-
-      if (STMT_VINFO_STRIDED_P (stmt_info)
-	  && slp
-	  && (group_size > nunits
-	      || nunits % group_size != 0))
-	{
-	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-			   "unhandled strided group store\n");
-	  return false;
-	}
-
-      if (first_stmt == stmt)
-	{
-          /* STMT is the leader of the group. Check the operands of all the
-             stmts of the group.  */
-          next_stmt = GROUP_NEXT_ELEMENT (stmt_info);
-          while (next_stmt)
-            {
-	      gcc_assert (gimple_assign_single_p (next_stmt));
-	      op = gimple_assign_rhs1 (next_stmt);
-              if (!vect_is_simple_use (op, vinfo, &def_stmt, &dt))
-                {
-                  if (dump_enabled_p ())
-                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                     "use not simple.\n");
-                  return false;
-                }
-              next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
-            }
-        }
-    }
-
-  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
-    {
-      gimple *def_stmt;
-      if (!vect_check_gather_scatter (stmt, loop_vinfo, &gs_info))
-	gcc_unreachable ();
-      if (!vect_is_simple_use (gs_info.offset, vinfo, &def_stmt,
-			       &gs_info.offset_dt, &gs_info.offset_vectype))
-	{
-	  if (dump_enabled_p ())
-	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                             "scatter index use not simple.");
-	  return false;
-	}
-    }
+  vect_memory_access_type memory_access_type;
+  if (!get_load_store_type (stmt, vectype, slp, vls_type,
+			    &memory_access_type, &gs_info))
+    return false;
 
   if (!vec_stmt) /* transformation not required.  */
     {
+      if (!slp)
+	STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) = memory_access_type;
       STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
       /* The SLP costs are calculated during SLP analysis.  */
       if (!PURE_SLP_STMT (stmt_info))
-	vect_model_store_cost (stmt_info, ncopies, store_lanes_p, dt,
+	vect_model_store_cost (stmt_info, ncopies, memory_access_type, dt,
 			       NULL, NULL, NULL);
       return true;
     }
+  if (!slp)
+    gcc_assert (memory_access_type
+		== STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info));
 
   /** Transform.  **/
 
   ensure_base_align (stmt_info, dr);
 
-  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+  if (memory_access_type == VMAT_GATHER_SCATTER)
     {
       tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE, op, src;
       tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gs_info.decl));
@@ -5538,8 +5698,10 @@  vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
       return true;
     }
 
+  grouped_store = STMT_VINFO_GROUPED_ACCESS (stmt_info);
   if (grouped_store)
     {
+      first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
       first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
       group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt));
 
@@ -5585,7 +5747,8 @@  vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
     dump_printf_loc (MSG_NOTE, vect_location,
                      "transform store. ncopies = %d\n", ncopies);
 
-  if (STMT_VINFO_STRIDED_P (stmt_info))
+  if (memory_access_type == VMAT_ELEMENTWISE
+      || memory_access_type == VMAT_STRIDED_SLP)
     {
       gimple_stmt_iterator incr_gsi;
       bool insert_after;
@@ -5756,14 +5919,14 @@  vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
   gcc_assert (alignment_support_scheme);
   /* Targets with store-lane instructions must not require explicit
      realignment.  */
-  gcc_assert (!store_lanes_p
+  gcc_assert (memory_access_type != VMAT_LOAD_STORE_LANES
 	      || alignment_support_scheme == dr_aligned
 	      || alignment_support_scheme == dr_unaligned_supported);
 
   if (negative)
     offset = size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1);
 
-  if (store_lanes_p)
+  if (memory_access_type == VMAT_LOAD_STORE_LANES)
     aggr_type = build_array_type_nelts (elem_type, vec_num * nunits);
   else
     aggr_type = vectype;
@@ -5901,7 +6064,7 @@  vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
 					   TYPE_SIZE_UNIT (aggr_type));
 	}
 
-      if (store_lanes_p)
+      if (memory_access_type == VMAT_LOAD_STORE_LANES)
 	{
 	  tree vec_array;
 
@@ -6185,7 +6348,6 @@  vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
   gphi *phi = NULL;
   vec<tree> dr_chain = vNULL;
   bool grouped_load = false;
-  bool load_lanes_p = false;
   gimple *first_stmt;
   gimple *first_stmt_for_drptr = NULL;
   bool inv_p;
@@ -6294,48 +6456,11 @@  vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
     {
       grouped_load = true;
       /* FORNOW */
-      gcc_assert (!nested_in_vect_loop && !STMT_VINFO_GATHER_SCATTER_P (stmt_info));
+      gcc_assert (!nested_in_vect_loop);
+      gcc_assert (!STMT_VINFO_GATHER_SCATTER_P (stmt_info));
 
       first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
       group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt));
-      bool single_element_p = (first_stmt == stmt
-			       && !GROUP_NEXT_ELEMENT (stmt_info));
-
-      if (!slp && !STMT_VINFO_STRIDED_P (stmt_info))
-	{
-	  if (vect_load_lanes_supported (vectype, group_size))
-	    load_lanes_p = true;
-	  else if (!vect_grouped_load_supported (vectype, single_element_p,
-						 group_size))
-	    return false;
-	}
-
-      if (single_element_p)
-	{
-	  /* Single-element interleaving requires peeling for gaps.  */
-	  gcc_assert (GROUP_GAP (stmt_info));
-	}
-
-      /* If there is a gap in the end of the group then we access excess
-	 elements in the last iteration and thus need to peel that off.  */
-      if (loop_vinfo
-	  && ! STMT_VINFO_STRIDED_P (stmt_info)
-	  && GROUP_GAP (vinfo_for_stmt (first_stmt)) != 0)
-	{
-	  if (dump_enabled_p ())
-	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-			     "Data access with gaps requires scalar "
-			     "epilogue loop\n");
-	  if (loop->inner)
-	    {
-	      if (dump_enabled_p ())
-		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-				 "Peeling for outer loop is not supported\n");
-	      return false;
-	    }
-
-	  LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
-	}
 
       if (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
 	slp_perm = true;
@@ -6381,24 +6506,13 @@  vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
 	}
     }
 
+  vect_memory_access_type memory_access_type;
+  if (!get_load_store_type (stmt, vectype, slp, VLS_LOAD,
+			    &memory_access_type, &gs_info))
+    return false;
 
-  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
-    {
-      gimple *def_stmt;
-      if (!vect_check_gather_scatter (stmt, loop_vinfo, &gs_info))
-	gcc_unreachable ();
-      if (!vect_is_simple_use (gs_info.offset, vinfo, &def_stmt,
-			       &gs_info.offset_dt, &gs_info.offset_vectype))
-	{
-	  if (dump_enabled_p ())
-	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                             "gather index use not simple.\n");
-	  return false;
-	}
-    }
-  else if (STMT_VINFO_STRIDED_P (stmt_info))
-    ;
-  else
+  if (!STMT_VINFO_GATHER_SCATTER_P (stmt_info)
+      && !STMT_VINFO_STRIDED_P (stmt_info))
     {
       negative = tree_int_cst_compare (nested_in_vect_loop
 				       ? STMT_VINFO_DR_STEP (stmt_info)
@@ -6444,14 +6558,20 @@  vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
 
   if (!vec_stmt) /* transformation not required.  */
     {
+      if (!slp)
+	STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) = memory_access_type;
       STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
       /* The SLP costs are calculated during SLP analysis.  */
       if (!PURE_SLP_STMT (stmt_info))
-	vect_model_load_cost (stmt_info, ncopies, load_lanes_p,
+	vect_model_load_cost (stmt_info, ncopies, memory_access_type,
 			      NULL, NULL, NULL);
       return true;
     }
 
+  if (!slp)
+    gcc_assert (memory_access_type
+		== STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info));
+
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
                      "transform load. ncopies = %d\n", ncopies);
@@ -6460,7 +6580,7 @@  vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
 
   ensure_base_align (stmt_info, dr);
 
-  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+  if (memory_access_type == VMAT_GATHER_SCATTER)
     {
       tree vec_oprnd0 = NULL_TREE, op;
       tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gs_info.decl));
@@ -6627,7 +6747,9 @@  vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
 	}
       return true;
     }
-  else if (STMT_VINFO_STRIDED_P (stmt_info))
+
+  if (memory_access_type == VMAT_ELEMENTWISE
+      || memory_access_type == VMAT_STRIDED_SLP)
     {
       gimple_stmt_iterator incr_gsi;
       bool insert_after;
@@ -6694,26 +6816,23 @@  vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
       int lnel = 1;
       tree ltype = TREE_TYPE (vectype);
       auto_vec<tree> dr_chain;
-      if (slp)
+      if (memory_access_type == VMAT_STRIDED_SLP)
 	{
-	  if (group_size < nunits
-	      && nunits % group_size == 0)
+	  nloads = nunits / group_size;
+	  if (group_size < nunits)
 	    {
-	      nloads = nunits / group_size;
 	      lnel = group_size;
 	      ltype = build_vector_type (TREE_TYPE (vectype), group_size);
-	      ltype = build_aligned_type (ltype,
-					  TYPE_ALIGN (TREE_TYPE (vectype)));
 	    }
-	  else if (group_size >= nunits
-		   && group_size % nunits == 0)
+	  else
 	    {
-	      nloads = 1;
 	      lnel = nunits;
 	      ltype = vectype;
-	      ltype = build_aligned_type (ltype,
-					  TYPE_ALIGN (TREE_TYPE (vectype)));
 	    }
+	  ltype = build_aligned_type (ltype, TYPE_ALIGN (TREE_TYPE (vectype)));
+	}
+      if (slp)
+	{
 	  /* For SLP permutation support we need to load the whole group,
 	     not only the number of vector stmts the permutation result
 	     fits in.  */
@@ -6845,7 +6964,7 @@  vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
   gcc_assert (alignment_support_scheme);
   /* Targets with load-lane instructions must not require explicit
      realignment.  */
-  gcc_assert (!load_lanes_p
+  gcc_assert (memory_access_type != VMAT_LOAD_STORE_LANES
 	      || alignment_support_scheme == dr_aligned
 	      || alignment_support_scheme == dr_unaligned_supported);
 
@@ -6980,7 +7099,7 @@  vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
   if (negative)
     offset = size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1);
 
-  if (load_lanes_p)
+  if (memory_access_type == VMAT_LOAD_STORE_LANES)
     aggr_type = build_array_type_nelts (elem_type, vec_num * nunits);
   else
     aggr_type = vectype;
@@ -7043,7 +7162,7 @@  vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
       if (grouped_load || slp_perm)
 	dr_chain.create (vec_num);
 
-      if (load_lanes_p)
+      if (memory_access_type == VMAT_LOAD_STORE_LANES)
 	{
 	  tree vec_array;
 
@@ -7313,7 +7432,7 @@  vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
         {
           if (grouped_load)
   	    {
-	      if (!load_lanes_p)
+	      if (memory_access_type != VMAT_LOAD_STORE_LANES)
 		vect_transform_grouped_load (stmt, dr_chain, group_size, gsi);
 	      *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
 	    }
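
[Closing aside, not part of the patch: once the tests are phrased in terms of the access type rather than separate booleans, the transform-time dispatch reads roughly as below.  Mock enum again; the descriptions paraphrase only what the hunks above already state about each VMAT_ value.]

enum sketch_vmat
{
  SKETCH_VMAT_ELEMENTWISE,
  SKETCH_VMAT_STRIDED_SLP,
  SKETCH_VMAT_CONTIGUOUS,
  SKETCH_VMAT_CONTIGUOUS_PERMUTE,
  SKETCH_VMAT_LOAD_STORE_LANES,
  SKETCH_VMAT_GATHER_SCATTER
};

static const char *
sketch_describe_access (enum sketch_vmat vmat)
{
  switch (vmat)
    {
    case SKETCH_VMAT_ELEMENTWISE:
      return "scalar accesses, one element at a time";
    case SKETCH_VMAT_STRIDED_SLP:
      return "groups of consecutive elements separated by a stride";
    case SKETCH_VMAT_CONTIGUOUS:
      return "ordinary contiguous vector accesses";
    case SKETCH_VMAT_CONTIGUOUS_PERMUTE:
      return "contiguous accesses plus load/store permutes";
    case SKETCH_VMAT_LOAD_STORE_LANES:
      return "target load-lanes/store-lanes instructions";
    case SKETCH_VMAT_GATHER_SCATTER:
      return "gather loads or scatter stores";
    default:
      return "unknown";
    }
}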