Message ID | mptlffqlbat.fsf@arm.com
---|---
State | New
Series | vect: Fix load costs for SLP permutes
On Wed, Oct 28, 2020 at 2:39 PM Richard Sandiford via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> For the following test case (compiled with load/store lanes
> disabled locally):
>
> void
> f (uint32_t *restrict x, uint8_t *restrict y, int n)
> {
>   for (int i = 0; i < n; ++i)
>     {
>       x[i * 2] = x[i * 2] + y[i * 2];
>       x[i * 2 + 1] = x[i * 2 + 1] + y[i * 2];
>     }
> }
>
> we have a redundant no-op permute on the x[] load node:
>
>   node 0x4472350 (max_nunits=8, refcnt=2)
>     stmt 0 _5 = *_4;
>     stmt 1 _13 = *_12;
>     load permutation { 0 1 }
>
> Then, when costing it, we pick a cost of 1, even though we need 4 copies
> of the x[] load to match a single y[] load:
>
>   ==> examining statement: _5 = *_4;
>   Vectorizing an unaligned access.
>   vect_model_load_cost: unaligned supported by hardware.
>   vect_model_load_cost: inside_cost = 1, prologue_cost = 0 .
>
> The problem is that the code only considers the permutation for
> the first scalar iteration, rather than for all VF iterations.
>
> This patch tries to fix that by using similar logic to
> vect_transform_slp_perm_load.
>
> Tested on aarch64-linux-gnu and x86_64-linux-gnu.  OK to install?

I wonder if we can instead do this counting in vect_transform_slp_perm_load,
where we already count the number of permutes.  That would avoid
the duplication of the "logic".

Richard.

> Richard
>
> [-b version also included]
>
>
> gcc/
>         * tree-vect-stmts.c (vect_model_load_cost): Use similar logic
>         to vect_transform_slp_perm_load when counting the number of
>         loads in a permuted SLP operation.
> ---
>  gcc/tree-vect-stmts.c | 67 ++++++++++++++++++++++++++-----------------
>  1 file changed, 41 insertions(+), 26 deletions(-)
>
> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
> index 3575f25241f..6eacd641e6b 100644
> --- a/gcc/tree-vect-stmts.c
> +++ b/gcc/tree-vect-stmts.c
> @@ -1099,38 +1099,53 @@ vect_model_load_cost (vec_info *vinfo,
>        stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
>        /* Record the cost for the permutation.  */
>        unsigned n_perms;
> -      unsigned assumed_nunits
> -        = vect_nunits_for_cost (STMT_VINFO_VECTYPE (first_stmt_info));
>        vect_transform_slp_perm_load (vinfo, slp_node, vNULL, NULL,
>                                      vf, true, &n_perms);
>        inside_cost += record_stmt_cost (cost_vec, n_perms, vec_perm,
>                                         first_stmt_info, 0, vect_body);
> -      /* And adjust the number of loads performed.  This handles
> -         redundancies as well as loads that are later dead.  */
> -      auto_sbitmap perm (DR_GROUP_SIZE (first_stmt_info));
> -      bitmap_clear (perm);
> -      for (unsigned i = 0;
> -           i < SLP_TREE_LOAD_PERMUTATION (slp_node).length (); ++i)
> -        bitmap_set_bit (perm, SLP_TREE_LOAD_PERMUTATION (slp_node)[i]);
> -      ncopies = 0;
> -      bool load_seen = false;
> -      for (unsigned i = 0; i < DR_GROUP_SIZE (first_stmt_info); ++i)
> -        {
> -          if (i % assumed_nunits == 0)
> +
> +      /* And if this is not a simple "load N vectors and then permute each
> +         vector internally" operation, adjust the number of loads performed.
> +         This handles redundancies as well as loads that are later dead.  */
> +      unsigned int nscalars = SLP_TREE_SCALAR_STMTS (slp_node).length ();
> +      unsigned int dr_group_size = DR_GROUP_SIZE (first_stmt_info);
> +      tree vectype = STMT_VINFO_VECTYPE (first_stmt_info);
> +      poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
> +      if (nscalars != dr_group_size
> +          || !multiple_p (nunits, nscalars))
> +        {
> +          /* The constant VF and nunits are enforced by
> +             vect_transform_slp_perm_load for this case.  */
> +          unsigned int const_nunits = nunits.to_constant ();
> +          unsigned int in_nlanes = dr_group_size * vf.to_constant ();
> +          unsigned int out_nlanes = nscalars * vf.to_constant ();
> +          auto_sbitmap perm (in_nlanes);
> +          bitmap_clear (perm);
> +          for (unsigned i = 0; i < out_nlanes; ++i)
> +            {
> +              unsigned int iter_num = i / nscalars;
> +              unsigned int stmt_num = i % nscalars;
> +              unsigned int in_lane
> +                = (iter_num * dr_group_size
> +                   + SLP_TREE_LOAD_PERMUTATION (slp_node)[stmt_num]);
> +              bitmap_set_bit (perm, in_lane);
> +            }
> +          ncopies = 0;
> +          bool load_seen = false;
> +          for (unsigned i = 0; i < in_nlanes; ++i)
>             {
> -              if (load_seen)
> -                ncopies++;
> -              load_seen = false;
> +              if (i % const_nunits == 0)
> +                {
> +                  if (load_seen)
> +                    ncopies++;
> +                  load_seen = false;
> +                }
> +              if (bitmap_bit_p (perm, i))
> +                load_seen = true;
>             }
> -          if (bitmap_bit_p (perm, i))
> -            load_seen = true;
> -        }
> -      if (load_seen)
> -        ncopies++;
> -      gcc_assert (ncopies
> -                  <= (DR_GROUP_SIZE (first_stmt_info)
> -                      - DR_GROUP_GAP (first_stmt_info)
> -                      + assumed_nunits - 1) / assumed_nunits);
> +          if (load_seen)
> +            ncopies++;
> +        }
>      }
>
>    /* Grouped loads read all elements in the group at once,
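To make the miscount concrete, here is a minimal standalone model (plain
C++, not GCC internals) of the whole-VF counting loop the patch adds: mark
every group lane that the load permutation touches across all VF
iterations, then count how many nunits-sized chunks contain a live lane.
The group size and permutation below come from the x[] node above; the
VF of 8 and the 4 lanes per vector are illustrative assumptions, not
values taken from an actual dump.

#include <cstdio>

static unsigned int
count_vector_loads (unsigned int dr_group_size, unsigned int nscalars,
                    unsigned int vf, unsigned int const_nunits,
                    const unsigned int *load_perm)
{
  unsigned int in_nlanes = dr_group_size * vf;
  unsigned int out_nlanes = nscalars * vf;
  bool used[256] = {};          /* stand-in for auto_sbitmap */
  for (unsigned int i = 0; i < out_nlanes; ++i)
    used[(i / nscalars) * dr_group_size + load_perm[i % nscalars]] = true;
  unsigned int ncopies = 0;
  bool load_seen = false;
  for (unsigned int i = 0; i < in_nlanes; ++i)
    {
      if (i % const_nunits == 0)
        {
          if (load_seen)
            ncopies++;
          load_seen = false;
        }
      if (used[i])
        load_seen = true;
    }
  return ncopies + (load_seen ? 1 : 0);
}

int
main (void)
{
  /* The x[] node: a group of 2 uint32_t loads with the no-op
     permutation { 0 1 }.  With 4 lanes per vector and VF = 8, all 16
     input lanes are live, so 4 vector loads are needed.  */
  static const unsigned int perm_x[] = { 0, 1 };
  printf ("whole-VF count: %u\n", count_vector_loads (2, 2, 8, 4, perm_x));

  /* The old code effectively counted a single scalar iteration (vf = 1),
     which is where the bogus cost of 1 came from.  */
  printf ("old-style count: %u\n", count_vector_loads (2, 2, 1, 4, perm_x));
  return 0;
}

A permutation that reuses one group element for several statements would
instead leave lanes unmarked and lower the count, which is the redundancy
case the comment in the patch mentions.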
Richard Biener <richard.guenther@gmail.com> writes:
> On Wed, Oct 28, 2020 at 2:39 PM Richard Sandiford via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
>>
>> For the following test case (compiled with load/store lanes
>> disabled locally):
>>
>> void
>> f (uint32_t *restrict x, uint8_t *restrict y, int n)
>> {
>>   for (int i = 0; i < n; ++i)
>>     {
>>       x[i * 2] = x[i * 2] + y[i * 2];
>>       x[i * 2 + 1] = x[i * 2 + 1] + y[i * 2];
>>     }
>> }
>>
>> we have a redundant no-op permute on the x[] load node:
>>
>>   node 0x4472350 (max_nunits=8, refcnt=2)
>>     stmt 0 _5 = *_4;
>>     stmt 1 _13 = *_12;
>>     load permutation { 0 1 }
>>
>> Then, when costing it, we pick a cost of 1, even though we need 4 copies
>> of the x[] load to match a single y[] load:
>>
>>   ==> examining statement: _5 = *_4;
>>   Vectorizing an unaligned access.
>>   vect_model_load_cost: unaligned supported by hardware.
>>   vect_model_load_cost: inside_cost = 1, prologue_cost = 0 .
>>
>> The problem is that the code only considers the permutation for
>> the first scalar iteration, rather than for all VF iterations.
>>
>> This patch tries to fix that by using similar logic to
>> vect_transform_slp_perm_load.
>>
>> Tested on aarch64-linux-gnu and x86_64-linux-gnu.  OK to install?
>
> I wonder if we can instead do this counting in vect_transform_slp_perm_load,
> where we already count the number of permutes.  That would avoid
> the duplication of the "logic".

Very scathing quotes :-)

But yeah, agree that's better.  How about this version?  Tested as before.

Richard


gcc/
        * tree-vectorizer.h (vect_transform_slp_perm_load): Take an
        optional extra parameter.
        * tree-vect-slp.c (vect_transform_slp_perm_load): Calculate
        the number of loads as well as the number of permutes, taking
        the counting loop from...
        * tree-vect-stmts.c (vect_model_load_cost): ...here.  Use the
        value computed by vect_transform_slp_perm_load for ncopies.
---
 gcc/tree-vect-slp.c   | 39 +++++++++++++++++++++++++++++++++++++--
 gcc/tree-vect-stmts.c | 32 ++++----------------------------
 gcc/tree-vectorizer.h |  3 ++-
 3 files changed, 43 insertions(+), 31 deletions(-)

diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index ff3a0c2fd8e..2b959af4c56 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -4827,13 +4827,16 @@ vect_get_slp_defs (vec_info *,
 
 /* Generate vector permute statements from a list of loads in DR_CHAIN.
    If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
-   permute statements for the SLP node NODE.  */
+   permute statements for the SLP node NODE.  Store the number of vector
+   permute instructions in *N_PERMS and the number of vector load
+   instructions in *N_LOADS.  */
 
 bool
 vect_transform_slp_perm_load (vec_info *vinfo,
                               slp_tree node, vec<tree> dr_chain,
                               gimple_stmt_iterator *gsi, poly_uint64 vf,
-                              bool analyze_only, unsigned *n_perms)
+                              bool analyze_only, unsigned *n_perms,
+                              unsigned int *n_loads)
 {
   stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
   int vec_index = 0;
@@ -4885,6 +4888,7 @@ vect_transform_slp_perm_load (vec_info *vinfo,
   vec_perm_builder mask;
   unsigned int nelts_to_build;
   unsigned int nvectors_per_build;
+  unsigned int in_nlanes;
   bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
                       && multiple_p (nunits, group_size));
   if (repeating_p)
@@ -4895,6 +4899,7 @@ vect_transform_slp_perm_load (vec_info *vinfo,
       mask.new_vector (nunits, group_size, 3);
       nelts_to_build = mask.encoded_nelts ();
       nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
+      in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
     }
   else
     {
@@ -4906,7 +4911,10 @@ vect_transform_slp_perm_load (vec_info *vinfo,
       mask.new_vector (const_nunits, const_nunits, 1);
       nelts_to_build = const_vf * group_size;
       nvectors_per_build = 1;
+      in_nlanes = const_vf * DR_GROUP_SIZE (stmt_info);
     }
+  auto_sbitmap used_in_lanes (in_nlanes);
+  bitmap_clear (used_in_lanes);
 
   unsigned int count = mask.encoded_nelts ();
   mask.quick_grow (count);
@@ -4918,6 +4926,7 @@ vect_transform_slp_perm_load (vec_info *vinfo,
       unsigned int stmt_num = j % group_size;
       unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
                         + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
+      bitmap_set_bit (used_in_lanes, i);
       if (repeating_p)
         {
           first_vec_index = 0;
@@ -5031,6 +5040,32 @@ vect_transform_slp_perm_load (vec_info *vinfo,
         }
     }
 
+  if (n_loads)
+    {
+      if (repeating_p)
+        *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
+      else
+        {
+          /* Enforced above when !repeating_p.  */
+          unsigned int const_nunits = nunits.to_constant ();
+          *n_loads = 0;
+          bool load_seen = false;
+          for (unsigned i = 0; i < in_nlanes; ++i)
+            {
+              if (i % const_nunits == 0)
+                {
+                  if (load_seen)
+                    *n_loads += 1;
+                  load_seen = false;
+                }
+              if (bitmap_bit_p (used_in_lanes, i))
+                load_seen = true;
+            }
+          if (load_seen)
+            *n_loads += 1;
+        }
+    }
+
   return true;
 }
 
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 7f0763f15c4..1a0da0e84cc 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -1098,39 +1098,15 @@ vect_model_load_cost (vec_info *vinfo,
          the first group element not by the first scalar stmt DR.  */
       stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
       /* Record the cost for the permutation.  */
-      unsigned n_perms;
-      unsigned assumed_nunits
-        = vect_nunits_for_cost (STMT_VINFO_VECTYPE (first_stmt_info));
+      unsigned n_perms, n_loads;
       vect_transform_slp_perm_load (vinfo, slp_node, vNULL, NULL,
-                                    vf, true, &n_perms);
+                                    vf, true, &n_perms, &n_loads);
       inside_cost += record_stmt_cost (cost_vec, n_perms, vec_perm,
                                        first_stmt_info, 0, vect_body);
+
       /* And adjust the number of loads performed.  This handles
-         redundancies as well as loads that are later dead.  */
-      auto_sbitmap perm (DR_GROUP_SIZE (first_stmt_info));
-      bitmap_clear (perm);
-      for (unsigned i = 0;
-           i < SLP_TREE_LOAD_PERMUTATION (slp_node).length (); ++i)
-        bitmap_set_bit (perm, SLP_TREE_LOAD_PERMUTATION (slp_node)[i]);
-      ncopies = 0;
-      bool load_seen = false;
-      for (unsigned i = 0; i < DR_GROUP_SIZE (first_stmt_info); ++i)
-        {
-          if (i % assumed_nunits == 0)
-            {
-              if (load_seen)
-                ncopies++;
-              load_seen = false;
-            }
-          if (bitmap_bit_p (perm, i))
-            load_seen = true;
-        }
-      if (load_seen)
-        ncopies++;
-      gcc_assert (ncopies
-                  <= (DR_GROUP_SIZE (first_stmt_info)
-                      - DR_GROUP_GAP (first_stmt_info)
-                      + assumed_nunits - 1) / assumed_nunits);
+         redundancies as well as loads that are later dead.  */
+      ncopies = n_loads;
     }
 
   /* Grouped loads read all elements in the group at once,
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 13a02cd0d0c..fbf5291cf06 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -1952,7 +1952,8 @@ extern tree cse_and_gimplify_to_preheader (loop_vec_info, tree);
 extern void vect_free_slp_instance (slp_instance);
 extern bool vect_transform_slp_perm_load (vec_info *, slp_tree, vec<tree>,
                                           gimple_stmt_iterator *, poly_uint64,
-                                          bool, unsigned *);
+                                          bool, unsigned *,
+                                          unsigned * = nullptr);
 extern bool vect_slp_analyze_operations (vec_info *);
 extern void vect_schedule_slp (vec_info *, vec<slp_instance>);
 extern opt_result vect_analyze_slp (vec_info *, unsigned);
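One interface detail worth noting: the new count is returned through an
out-parameter that defaults to nullptr, so the existing transform-time
callers compile unchanged and only the costing path opts in.  A minimal
sketch of that shape (stand-in names and dummy values, not the real GCC
declarations beyond what the patch itself shows):

#include <cstdio>

/* Model of the v2 signature: old callers pass just N_PERMS, while the
   costing caller also asks for the number of vector loads.  The values
   stored here are dummies.  */
static bool
transform_slp_perm_load_model (unsigned int vf, bool analyze_only,
                               unsigned int *n_perms,
                               unsigned int *n_loads = nullptr)
{
  (void) vf;
  (void) analyze_only;
  *n_perms = 0;
  if (n_loads)
    *n_loads = 4;
  return true;
}

int
main (void)
{
  unsigned int n_perms, n_loads;
  /* Pre-existing caller: unaffected by the new parameter.  */
  transform_slp_perm_load_model (8, true, &n_perms);
  /* Costing caller: additionally receives the load count.  */
  transform_slp_perm_load_model (8, true, &n_perms, &n_loads);
  printf ("n_perms = %u, n_loads = %u\n", n_perms, n_loads);
  return 0;
}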
On Thu, Oct 29, 2020 at 12:52 PM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Richard Biener <richard.guenther@gmail.com> writes:
> > On Wed, Oct 28, 2020 at 2:39 PM Richard Sandiford via Gcc-patches
> > <gcc-patches@gcc.gnu.org> wrote:
> >> [...]
> >>
> >> The problem is that the code only considers the permutation for
> >> the first scalar iteration, rather than for all VF iterations.
> >>
> >> This patch tries to fix that by using similar logic to
> >> vect_transform_slp_perm_load.
> >>
> >> Tested on aarch64-linux-gnu and x86_64-linux-gnu.  OK to install?
> >
> > I wonder if we can instead do this counting in vect_transform_slp_perm_load,
> > where we already count the number of permutes.  That would avoid
> > the duplication of the "logic".
>
> Very scathing quotes :-)
>
> But yeah, agree that's better.  How about this version?  Tested as before.

OK.

Thanks,
Richard.

> [...]
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 3575f25241f..6eacd641e6b 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -1099,38 +1099,53 @@ vect_model_load_cost (vec_info *vinfo,
       stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
       /* Record the cost for the permutation.  */
       unsigned n_perms;
-      unsigned assumed_nunits
-        = vect_nunits_for_cost (STMT_VINFO_VECTYPE (first_stmt_info));
       vect_transform_slp_perm_load (vinfo, slp_node, vNULL, NULL,
                                     vf, true, &n_perms);
       inside_cost += record_stmt_cost (cost_vec, n_perms, vec_perm,
                                        first_stmt_info, 0, vect_body);
-      /* And adjust the number of loads performed.  This handles
-         redundancies as well as loads that are later dead.  */
-      auto_sbitmap perm (DR_GROUP_SIZE (first_stmt_info));
-      bitmap_clear (perm);
-      for (unsigned i = 0;
-           i < SLP_TREE_LOAD_PERMUTATION (slp_node).length (); ++i)
-        bitmap_set_bit (perm, SLP_TREE_LOAD_PERMUTATION (slp_node)[i]);
-      ncopies = 0;
-      bool load_seen = false;
-      for (unsigned i = 0; i < DR_GROUP_SIZE (first_stmt_info); ++i)
-        {
-          if (i % assumed_nunits == 0)
+
+      /* And if this is not a simple "load N vectors and then permute each
+         vector internally" operation, adjust the number of loads performed.
+         This handles redundancies as well as loads that are later dead.  */
+      unsigned int nscalars = SLP_TREE_SCALAR_STMTS (slp_node).length ();
+      unsigned int dr_group_size = DR_GROUP_SIZE (first_stmt_info);
+      tree vectype = STMT_VINFO_VECTYPE (first_stmt_info);
+      poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
+      if (nscalars != dr_group_size
+          || !multiple_p (nunits, nscalars))
+        {
+          /* The constant VF and nunits are enforced by
+             vect_transform_slp_perm_load for this case.  */
+          unsigned int const_nunits = nunits.to_constant ();
+          unsigned int in_nlanes = dr_group_size * vf.to_constant ();
+          unsigned int out_nlanes = nscalars * vf.to_constant ();
+          auto_sbitmap perm (in_nlanes);
+          bitmap_clear (perm);
+          for (unsigned i = 0; i < out_nlanes; ++i)
+            {
+              unsigned int iter_num = i / nscalars;
+              unsigned int stmt_num = i % nscalars;
+              unsigned int in_lane
+                = (iter_num * dr_group_size
+                   + SLP_TREE_LOAD_PERMUTATION (slp_node)[stmt_num]);
+              bitmap_set_bit (perm, in_lane);
+            }
+          ncopies = 0;
+          bool load_seen = false;
+          for (unsigned i = 0; i < in_nlanes; ++i)
            {
-              if (load_seen)
-                ncopies++;
-              load_seen = false;
+              if (i % const_nunits == 0)
+                {
+                  if (load_seen)
+                    ncopies++;
+                  load_seen = false;
+                }
+              if (bitmap_bit_p (perm, i))
+                load_seen = true;
            }
-          if (bitmap_bit_p (perm, i))
-            load_seen = true;
-        }
-      if (load_seen)
-        ncopies++;
-      gcc_assert (ncopies
-                  <= (DR_GROUP_SIZE (first_stmt_info)
-                      - DR_GROUP_GAP (first_stmt_info)
-                      + assumed_nunits - 1) / assumed_nunits);
+          if (load_seen)
+            ncopies++;
+        }
     }
 
   /* Grouped loads read all elements in the group at once,