[05/10] vect: Adjust vectorizable_store costing on VMAT_ELEMENTWISE and VMAT_STRIDED_SLP

Message ID 2adef8b10433859b6642282b03a11df33c732d11.1694657494.git.linkw@linux.ibm.com
State New
Series vect: Move costing next to the transform for vect store

Commit Message

Kewen.Lin Sept. 14, 2023, 3:11 a.m. UTC
This patch adjusts the cost handling on VMAT_ELEMENTWISE
and VMAT_STRIDED_SLP in function vectorizable_store.  We
don't call function vect_model_store_cost for them any more.

Like the improvement for PR82255 on the load side, this change
gets rid of unnecessary vec_to_scalar costing for some cases
with VMAT_STRIDED_SLP.  A typical test case,
gcc.dg/vect/costmodel/ppc/costmodel-vect-store-1.c, is added
for this.  It also helps some cases that previously got
inconsistent costing.

Besides, this also special-cases interleaving stores for these
two affected memory access types: for interleaving stores the
whole chain is vectorized when the last store in the chain is
reached, and the other stores in the group are skipped.  To
stay consistent with that, and to follow the transform handling
which iterates over the whole group, the costing is done only
for the first store in the group.  Ideally we would cost only
the last one, but that is not trivial to get, and costing the
first one is equivalent.
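
For reference, here is a condensed sketch of what the new costing path in
vectorizable_store does for these two access types (it only paraphrases the
hunk below; the real loop also iterates over the group members and the
vector copies):

    for (unsigned i = 0; i < nstores; i++)
      {
	/* Extracting a lane is only needed when one vector feeds
	   several stores.  */
	if (nstores > 1)
	  inside_cost += record_stmt_cost (cost_vec, 1, vec_to_scalar,
					   stmt_info, 0, vect_body);
	/* A single-lane vector store is costed as a scalar store,
	   otherwise use the normal vector store cost.  */
	if (VECTOR_TYPE_P (ltype)
	    && known_ne (TYPE_VECTOR_SUBPARTS (ltype), 1U))
	  vect_get_store_cost (vinfo, stmt_info, 1, alignment_support_scheme,
			       misalignment, &inside_cost, cost_vec);
	else
	  inside_cost += record_stmt_cost (cost_vec, 1, scalar_store,
					   stmt_info, 0, vect_body);
      }

So when the group is stored with whole vectors (nstores == 1), no
vec_to_scalar cost is recorded any more, which is what the new ppc test
case checks.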

gcc/ChangeLog:

	* tree-vect-stmts.cc (vect_model_store_cost): Assert it no longer
	gets VMAT_ELEMENTWISE or VMAT_STRIDED_SLP, and remove the related
	handling.
	(vectorizable_store): Adjust the cost handling on VMAT_ELEMENTWISE
	and VMAT_STRIDED_SLP without calling vect_model_store_cost.

gcc/testsuite/ChangeLog:

	* gcc.dg/vect/costmodel/ppc/costmodel-vect-store-1.c: New test.
---
 .../costmodel/ppc/costmodel-vect-store-1.c    |  23 +++
 gcc/tree-vect-stmts.cc                        | 160 +++++++++++-------
 2 files changed, 120 insertions(+), 63 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-vect-store-1.c

Comments

Richard Biener Sept. 27, 2023, 11:26 a.m. UTC | #1
On Thu, Sep 14, 2023 at 5:12 AM Kewen Lin <linkw@linux.ibm.com> wrote:
>
> This patch adjusts the cost handling on VMAT_ELEMENTWISE
> and VMAT_STRIDED_SLP in function vectorizable_store.  We
> don't call function vect_model_store_cost for them any more.
>
> Like the improvement for PR82255 on the load side, this change
> gets rid of unnecessary vec_to_scalar costing for some cases
> with VMAT_STRIDED_SLP.  A typical test case,
> gcc.dg/vect/costmodel/ppc/costmodel-vect-store-1.c, is added
> for this.  It also helps some cases that previously got
> inconsistent costing.
>
> Besides, this also special-cases interleaving stores for these
> two affected memory access types: for interleaving stores the
> whole chain is vectorized when the last store in the chain is
> reached, and the other stores in the group are skipped.  To
> stay consistent with that, and to follow the transform handling
> which iterates over the whole group, the costing is done only
> for the first store in the group.  Ideally we would cost only
> the last one, but that is not trivial to get, and costing the
> first one is equivalent.

OK

Patch

diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-vect-store-1.c b/gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-vect-store-1.c
new file mode 100644
index 00000000000..ab5f3301492
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-vect-store-1.c
@@ -0,0 +1,23 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-additional-options "-O3" } */
+
+/* This test case is partially extracted from case
+   gcc.dg/vect/vect-avg-16.c, it's to verify we don't
+   cost a store with vec_to_scalar when we shouldn't.  */
+
+void
+test (signed char *restrict a, signed char *restrict b, signed char *restrict c,
+      int n)
+{
+  for (int j = 0; j < n; ++j)
+    {
+      for (int i = 0; i < 16; ++i)
+	a[i] = (b[i] + c[i]) >> 1;
+      a += 20;
+      b += 20;
+      c += 20;
+    }
+}
+
+/* { dg-final { scan-tree-dump-times "vec_to_scalar" 0 "vect" } } */
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 048c14d291c..3d01168080a 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -964,7 +964,9 @@  vect_model_store_cost (vec_info *vinfo, stmt_vec_info stmt_info, int ncopies,
 		       vec_load_store_type vls_type, slp_tree slp_node,
 		       stmt_vector_for_cost *cost_vec)
 {
-  gcc_assert (memory_access_type != VMAT_GATHER_SCATTER);
+  gcc_assert (memory_access_type != VMAT_GATHER_SCATTER
+	      && memory_access_type != VMAT_ELEMENTWISE
+	      && memory_access_type != VMAT_STRIDED_SLP);
   unsigned int inside_cost = 0, prologue_cost = 0;
   stmt_vec_info first_stmt_info = stmt_info;
   bool grouped_access_p = STMT_VINFO_GROUPED_ACCESS (stmt_info);
@@ -1010,29 +1012,9 @@  vect_model_store_cost (vec_info *vinfo, stmt_vec_info stmt_info, int ncopies,
                          group_size);
     }
 
-  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
   /* Costs of the stores.  */
-  if (memory_access_type == VMAT_ELEMENTWISE)
-    {
-      unsigned int assumed_nunits = vect_nunits_for_cost (vectype);
-      /* N scalar stores plus extracting the elements.  */
-      inside_cost += record_stmt_cost (cost_vec,
-				       ncopies * assumed_nunits,
-				       scalar_store, stmt_info, 0, vect_body);
-    }
-  else
-    vect_get_store_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
-			 misalignment, &inside_cost, cost_vec);
-
-  if (memory_access_type == VMAT_ELEMENTWISE
-      || memory_access_type == VMAT_STRIDED_SLP)
-    {
-      /* N scalar stores plus extracting the elements.  */
-      unsigned int assumed_nunits = vect_nunits_for_cost (vectype);
-      inside_cost += record_stmt_cost (cost_vec,
-				       ncopies * assumed_nunits,
-				       vec_to_scalar, stmt_info, 0, vect_body);
-    }
+  vect_get_store_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
+		       misalignment, &inside_cost, cost_vec);
 
   /* When vectorizing a store into the function result assign
      a penalty if the function returns in a multi-register location.
@@ -8416,6 +8398,18 @@  vectorizable_store (vec_info *vinfo,
 			 "Vectorizing an unaligned access.\n");
 
       STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
+
+      /* As function vect_transform_stmt shows, for interleaving stores
+	 the whole chain is vectorized when the last store in the chain
+	 is reached and the other stores in the group are skipped.  So
+	 we would want to cost only the last one here, but it's not
+	 trivial to get the last one; since costing the first one is
+	 equivalent, cost the first one instead.  */
+      if (grouped_store
+	  && !slp
+	  && first_stmt_info != stmt_info
+	  && memory_access_type == VMAT_ELEMENTWISE)
+	return true;
     }
   gcc_assert (memory_access_type == STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info));
 
@@ -8488,14 +8482,7 @@  vectorizable_store (vec_info *vinfo,
   if (memory_access_type == VMAT_ELEMENTWISE
       || memory_access_type == VMAT_STRIDED_SLP)
     {
-      if (costing_p)
-	{
-	  vect_model_store_cost (vinfo, stmt_info, ncopies, memory_access_type,
-				 alignment_support_scheme, misalignment,
-				 vls_type, slp_node, cost_vec);
-	  return true;
-	}
-
+      unsigned inside_cost = 0, prologue_cost = 0;
       gimple_stmt_iterator incr_gsi;
       bool insert_after;
       gimple *incr;
@@ -8503,7 +8490,7 @@  vectorizable_store (vec_info *vinfo,
       tree ivstep;
       tree running_off;
       tree stride_base, stride_step, alias_off;
-      tree vec_oprnd;
+      tree vec_oprnd = NULL_TREE;
       tree dr_offset;
       unsigned int g;
       /* Checked by get_load_store_type.  */
@@ -8609,26 +8596,30 @@  vectorizable_store (vec_info *vinfo,
 		  lnel = const_nunits;
 		  ltype = vectype;
 		  lvectype = vectype;
+		  alignment_support_scheme = dr_align;
+		  misalignment = mis_align;
 		}
 	    }
 	  ltype = build_aligned_type (ltype, TYPE_ALIGN (elem_type));
 	  ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
 	}
 
-      ivstep = stride_step;
-      ivstep = fold_build2 (MULT_EXPR, TREE_TYPE (ivstep), ivstep,
-			    build_int_cst (TREE_TYPE (ivstep), vf));
+      if (!costing_p)
+	{
+	  ivstep = stride_step;
+	  ivstep = fold_build2 (MULT_EXPR, TREE_TYPE (ivstep), ivstep,
+				build_int_cst (TREE_TYPE (ivstep), vf));
 
-      standard_iv_increment_position (loop, &incr_gsi, &insert_after);
+	  standard_iv_increment_position (loop, &incr_gsi, &insert_after);
 
-      stride_base = cse_and_gimplify_to_preheader (loop_vinfo, stride_base);
-      ivstep = cse_and_gimplify_to_preheader (loop_vinfo, ivstep);
-      create_iv (stride_base, PLUS_EXPR, ivstep, NULL,
-		 loop, &incr_gsi, insert_after,
-		 &offvar, NULL);
-      incr = gsi_stmt (incr_gsi);
+	  stride_base = cse_and_gimplify_to_preheader (loop_vinfo, stride_base);
+	  ivstep = cse_and_gimplify_to_preheader (loop_vinfo, ivstep);
+	  create_iv (stride_base, PLUS_EXPR, ivstep, NULL, loop, &incr_gsi,
+		     insert_after, &offvar, NULL);
+	  incr = gsi_stmt (incr_gsi);
 
-      stride_step = cse_and_gimplify_to_preheader (loop_vinfo, stride_step);
+	  stride_step = cse_and_gimplify_to_preheader (loop_vinfo, stride_step);
+	}
 
       alias_off = build_int_cst (ref_type, 0);
       stmt_vec_info next_stmt_info = first_stmt_info;
@@ -8636,39 +8627,76 @@  vectorizable_store (vec_info *vinfo,
       for (g = 0; g < group_size; g++)
 	{
 	  running_off = offvar;
-	  if (g)
+	  if (!costing_p)
 	    {
-	      tree size = TYPE_SIZE_UNIT (ltype);
-	      tree pos = fold_build2 (MULT_EXPR, sizetype, size_int (g),
-				      size);
-	      tree newoff = copy_ssa_name (running_off, NULL);
-	      incr = gimple_build_assign (newoff, POINTER_PLUS_EXPR,
-					  running_off, pos);
-	      vect_finish_stmt_generation (vinfo, stmt_info, incr, gsi);
-	      running_off = newoff;
+	      if (g)
+		{
+		  tree size = TYPE_SIZE_UNIT (ltype);
+		  tree pos
+		    = fold_build2 (MULT_EXPR, sizetype, size_int (g), size);
+		  tree newoff = copy_ssa_name (running_off, NULL);
+		  incr = gimple_build_assign (newoff, POINTER_PLUS_EXPR,
+					      running_off, pos);
+		  vect_finish_stmt_generation (vinfo, stmt_info, incr, gsi);
+		  running_off = newoff;
+		}
 	    }
 	  if (!slp)
 	    op = vect_get_store_rhs (next_stmt_info);
-	  vect_get_vec_defs (vinfo, next_stmt_info, slp_node, ncopies,
-			     op, &vec_oprnds);
+	  if (!costing_p)
+	    vect_get_vec_defs (vinfo, next_stmt_info, slp_node, ncopies, op,
+			       &vec_oprnds);
+	  else if (!slp)
+	    {
+	      enum vect_def_type cdt;
+	      gcc_assert (vect_is_simple_use (op, vinfo, &cdt));
+	      if (cdt == vect_constant_def || cdt == vect_external_def)
+		prologue_cost += record_stmt_cost (cost_vec, 1, scalar_to_vec,
+						   stmt_info, 0, vect_prologue);
+	    }
 	  unsigned int group_el = 0;
 	  unsigned HOST_WIDE_INT
 	    elsz = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
 	  for (j = 0; j < ncopies; j++)
 	    {
-	      vec_oprnd = vec_oprnds[j];
-	      /* Pun the vector to extract from if necessary.  */
-	      if (lvectype != vectype)
+	      if (!costing_p)
 		{
-		  tree tem = make_ssa_name (lvectype);
-		  gimple *pun
-		    = gimple_build_assign (tem, build1 (VIEW_CONVERT_EXPR,
-							lvectype, vec_oprnd));
-		  vect_finish_stmt_generation (vinfo, stmt_info, pun, gsi);
-		  vec_oprnd = tem;
+		  vec_oprnd = vec_oprnds[j];
+		  /* Pun the vector to extract from if necessary.  */
+		  if (lvectype != vectype)
+		    {
+		      tree tem = make_ssa_name (lvectype);
+		      tree cvt
+			= build1 (VIEW_CONVERT_EXPR, lvectype, vec_oprnd);
+		      gimple *pun = gimple_build_assign (tem, cvt);
+		      vect_finish_stmt_generation (vinfo, stmt_info, pun, gsi);
+		      vec_oprnd = tem;
+		    }
 		}
 	      for (i = 0; i < nstores; i++)
 		{
+		  if (costing_p)
+		    {
+		      /* Only need vector extracting when there is more
+			 than one store.  */
+		      if (nstores > 1)
+			inside_cost
+			  += record_stmt_cost (cost_vec, 1, vec_to_scalar,
+					       stmt_info, 0, vect_body);
+		      /* Take a single-lane vector type store as a scalar
+			 store to avoid ICE like PR110776.  */
+		      if (VECTOR_TYPE_P (ltype)
+			  && known_ne (TYPE_VECTOR_SUBPARTS (ltype), 1U))
+			vect_get_store_cost (vinfo, stmt_info, 1,
+					     alignment_support_scheme,
+					     misalignment, &inside_cost,
+					     cost_vec);
+		      else
+			inside_cost
+			  += record_stmt_cost (cost_vec, 1, scalar_store,
+					       stmt_info, 0, vect_body);
+		      continue;
+		    }
 		  tree newref, newoff;
 		  gimple *incr, *assign;
 		  tree size = TYPE_SIZE (ltype);
@@ -8719,6 +8747,12 @@  vectorizable_store (vec_info *vinfo,
 	    break;
 	}
 
+      if (costing_p && dump_enabled_p ())
+	dump_printf_loc (MSG_NOTE, vect_location,
+			 "vect_model_store_cost: inside_cost = %d, "
+			 "prologue_cost = %d .\n",
+			 inside_cost, prologue_cost);
+
       return true;
     }