@@ -32,5 +32,4 @@ int main()
return 0;
}
-/* ??? XFAILed because we access "excess" elements with the permutation. */
-/* { dg-final { scan-tree-dump "basic block vectorized" "slp2" { target vect_perm xfail { ! { aarch64*-*-* arm*-*-* } } } } } */
+/* { dg-final { scan-tree-dump "basic block vectorized" "slp2" { target vect_perm } } } */
@@ -13,7 +13,8 @@ void foo(void)
b[3] = a[3][0];
}
-/* ??? The profitability check is not reached because we give up on the
- gaps we access earlier. */
+/* ??? Due to the gaps we fall back to scalar loads, which makes the
+ vectorization profitable. */
/* { dg-final { scan-tree-dump "not profitable" "slp2" { xfail *-*-* } } } */
-/* { dg-final { scan-tree-dump-times "Basic block will be vectorized" 0 "slp2" } } */
+/* { dg-final { scan-tree-dump-times "BB vectorization with gaps at the end of a load is not supported" 1 "slp2" } } */
+/* { dg-final { scan-tree-dump-times "Basic block will be vectorized" 1 "slp2" } } */
@@ -19,7 +19,9 @@ void foo ()
}
/* We may not vectorize the store to x[] as it accesses c out-of bounds
- but we do want to vectorize the other two store groups. */
+ but we do want to vectorize the other two store groups. But we may
+ end up using scalar loads to vectorize the last group. */
/* { dg-final { scan-tree-dump-times "basic block vectorized" 1 "slp2" } } */
-/* { dg-final { scan-tree-dump-times "x\\\[\[0-1\]\\\] = " 2 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "BB vectorization with gaps at the end of a load is not supported" 1 "slp2" } } */
+/* { dg-final { scan-tree-dump-times " = c\\\[4\\\];" 1 "optimized" } } */
@@ -2043,6 +2043,9 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
/* Update the vectorization factor based on the SLP decision. */
vect_update_vf_for_slp (loop_vinfo);
+
+ /* Optimize the SLP graph with the vectorization factor fixed. */
+ vect_optimize_slp (loop_vinfo);
}
bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
@@ -1850,6 +1850,9 @@ vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
unsigned int lidx;
slp_tree node, load;
+ if (SLP_INSTANCE_LOADS (slp_instn).is_empty ())
+ return false;
+
/* Compare all the permutation sequences to the first one. We know
that at least one load is permuted. */
node = SLP_INSTANCE_LOADS (slp_instn)[0];
@@ -1926,7 +1929,7 @@ vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
/* Gather loads in the SLP graph NODE and populate the INST loads array. */
static void
-vect_gather_slp_loads (slp_instance inst, slp_tree node,
+vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
hash_set<slp_tree> &visited)
{
if (visited.add (node))
@@ -1939,14 +1942,14 @@ vect_gather_slp_loads (slp_instance inst, slp_tree node,
stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
&& DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
- SLP_INSTANCE_LOADS (inst).safe_push (node);
+ loads.safe_push (node);
}
else
{
unsigned i;
slp_tree child;
FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
- vect_gather_slp_loads (inst, child, visited);
+ vect_gather_slp_loads (loads, child, visited);
}
}
@@ -1954,138 +1957,7 @@ static void
vect_gather_slp_loads (slp_instance inst, slp_tree node)
{
hash_set<slp_tree> visited;
- vect_gather_slp_loads (inst, node, visited);
-}
-
-/* Check if the required load permutations in the SLP instance
- SLP_INSTN are supported. */
-
-static bool
-vect_supported_load_permutation_p (slp_instance slp_instn)
-{
- unsigned int i, j, k, next;
- slp_tree node;
-
- if (dump_enabled_p ())
- {
- dump_printf_loc (MSG_NOTE, vect_location, "Load permutation ");
- FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
- if (node->load_permutation.exists ())
- FOR_EACH_VEC_ELT (node->load_permutation, j, next)
- dump_printf (MSG_NOTE, "%d ", next);
- else
- for (k = 0; k < SLP_TREE_SCALAR_STMTS (node).length (); ++k)
- dump_printf (MSG_NOTE, "%d ", k);
- dump_printf (MSG_NOTE, "\n");
- }
-
- /* In case of reduction every load permutation is allowed, since the order
- of the reduction statements is not important (as opposed to the case of
- grouped stores). The only condition we need to check is that all the
- load nodes are of the same size and have the same permutation (and then
- rearrange all the nodes of the SLP instance according to this
- permutation). */
-
- /* Check that all the load nodes are of the same size. */
- /* ??? Can't we assert this? */
- unsigned int group_size
- = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (slp_instn)).length ();
- FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
- if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size)
- return false;
-
- node = SLP_INSTANCE_TREE (slp_instn);
- stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
-
- /* Reduction (there are no data-refs in the root).
- In reduction chain the order of the loads is not important. */
- if (!STMT_VINFO_DATA_REF (stmt_info)
- && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
- vect_attempt_slp_rearrange_stmts (slp_instn);
-
- /* In basic block vectorization we allow any subchain of an interleaving
- chain.
- FORNOW: not supported in loop SLP because of realignment compications. */
- if (STMT_VINFO_BB_VINFO (stmt_info))
- {
- /* Check whether the loads in an instance form a subchain and thus
- no permutation is necessary. */
- FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
- {
- if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
- continue;
- bool subchain_p = true;
- stmt_vec_info next_load_info = NULL;
- stmt_vec_info load_info;
- FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
- {
- if (j != 0
- && (next_load_info != load_info
- || DR_GROUP_GAP (load_info) != 1))
- {
- subchain_p = false;
- break;
- }
- next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
- }
- if (subchain_p)
- SLP_TREE_LOAD_PERMUTATION (node).release ();
- else
- {
- stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (node)[0];
- group_info = DR_GROUP_FIRST_ELEMENT (group_info);
- unsigned HOST_WIDE_INT nunits;
- unsigned k, maxk = 0;
- FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), j, k)
- if (k > maxk)
- maxk = k;
- /* In BB vectorization we may not actually use a loaded vector
- accessing elements in excess of DR_GROUP_SIZE. */
- tree vectype = STMT_VINFO_VECTYPE (group_info);
- if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
- || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
- {
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "BB vectorization with gaps at the end of "
- "a load is not supported\n");
- return false;
- }
-
- /* Verify the permutation can be generated. */
- vec<tree> tem;
- unsigned n_perms;
- if (!vect_transform_slp_perm_load (node, tem, NULL,
- 1, true, &n_perms))
- {
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION,
- vect_location,
- "unsupported load permutation\n");
- return false;
- }
- }
- }
- return true;
- }
-
- /* For loop vectorization verify we can generate the permutation. Be
- conservative about the vectorization factor, there are permutations
- that will use three vector inputs only starting from a specific factor
- and the vectorization factor is not yet final.
- ??? The SLP instance unrolling factor might not be the maximum one. */
- unsigned n_perms;
- poly_uint64 test_vf
- = force_common_multiple (SLP_INSTANCE_UNROLLING_FACTOR (slp_instn),
- LOOP_VINFO_VECT_FACTOR
- (STMT_VINFO_LOOP_VINFO (stmt_info)));
- FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
- if (node->load_permutation.exists ()
- && !vect_transform_slp_perm_load (node, vNULL, NULL, test_vf,
- true, &n_perms))
- return false;
-
- return true;
+ vect_gather_slp_loads (SLP_INSTANCE_LOADS (inst), node, visited);
}
@@ -2325,7 +2197,7 @@ vect_analyze_slp_instance (vec_info *vinfo,
"SLP size %u vs. limit %u.\n",
tree_size, max_tree_size);
- /* Compute the load permutation. */
+ /* Check whether any load is possibly permuted. */
slp_tree load_node;
bool loads_permuted = false;
FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node)
@@ -2334,43 +2206,15 @@ vect_analyze_slp_instance (vec_info *vinfo,
continue;
unsigned j;
stmt_vec_info load_info;
- bool this_load_permuted = false;
- stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT
- (SLP_TREE_SCALAR_STMTS (load_node)[0]);
FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
{
- this_load_permuted = true;
+ loads_permuted = true;
break;
}
- if (!this_load_permuted
- /* The load requires permutation when unrolling exposes
- a gap either because the group is larger than the SLP
- group-size or because there is a gap between the groups. */
- && (known_eq (unrolling_factor, 1U)
- || (group_size == DR_GROUP_SIZE (first_stmt_info)
- && DR_GROUP_GAP (first_stmt_info) == 0)))
- {
- SLP_TREE_LOAD_PERMUTATION (load_node).release ();
- continue;
- }
- loads_permuted = true;
}
- if (loads_permuted)
- {
- if (!vect_supported_load_permutation_p (new_instance))
- {
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "Build SLP failed: unsupported load "
- "permutation %G", stmt_info->stmt);
- vect_free_slp_instance (new_instance, false);
- return false;
- }
- }
-
- /* If the loads and stores can be handled with load/store-lan
+ /* If the loads and stores can be handled with load/store-lane
instructions do not generate this SLP instance. */
if (is_a <loop_vec_info> (vinfo)
&& loads_permuted
@@ -2555,9 +2399,93 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
vect_free_slp_tree ((*it).second, false);
delete bst_map;
+ /* Optimize permutations in SLP reductions. */
+ slp_instance instance;
+ FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
+ {
+ slp_tree node = SLP_INSTANCE_TREE (instance);
+ stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
+ /* Reduction (there are no data-refs in the root).
+ In reduction chain the order of the loads is not important. */
+ if (!STMT_VINFO_DATA_REF (stmt_info)
+ && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
+ vect_attempt_slp_rearrange_stmts (instance);
+ }
+
+ /* Gather all loads in the SLP graph. */
+ hash_set<slp_tree> visited;
+ FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
+ vect_gather_slp_loads (vinfo->slp_loads, SLP_INSTANCE_TREE (instance),
+ visited);
+
return opt_result::success ();
}
+void
+vect_optimize_slp (vec_info *vinfo)
+{
+ slp_tree node;
+ unsigned i;
+ FOR_EACH_VEC_ELT (vinfo->slp_loads, i, node)
+ {
+ if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
+ continue;
+
+ /* In basic block vectorization we allow any subchain of an interleaving
+ chain.
+ FORNOW: not in loop SLP because of realignment complications. */
+ if (is_a <bb_vec_info> (vinfo))
+ {
+ bool subchain_p = true;
+ stmt_vec_info next_load_info = NULL;
+ stmt_vec_info load_info;
+ unsigned j;
+ FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
+ {
+ if (j != 0
+ && (next_load_info != load_info
+ || DR_GROUP_GAP (load_info) != 1))
+ {
+ subchain_p = false;
+ break;
+ }
+ next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
+ }
+ if (subchain_p)
+ {
+ SLP_TREE_LOAD_PERMUTATION (node).release ();
+ continue;
+ }
+ }
+ else
+ {
+ stmt_vec_info load_info;
+ bool this_load_permuted = false;
+ unsigned j;
+ FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
+ if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
+ {
+ this_load_permuted = true;
+ break;
+ }
+ stmt_vec_info first_stmt_info
+ = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
+ if (!this_load_permuted
+ /* The load requires permutation when unrolling exposes
+ a gap either because the group is larger than the SLP
+ group-size or because there is a gap between the groups. */
+ && (known_eq (LOOP_VINFO_VECT_FACTOR (as_a <loop_vec_info> (vinfo)), 1U)
+ || ((SLP_TREE_SCALAR_STMTS (node).length ()
+ == DR_GROUP_SIZE (first_stmt_info))
+ && DR_GROUP_GAP (first_stmt_info) == 0)))
+ {
+ SLP_TREE_LOAD_PERMUTATION (node).release ();
+ continue;
+ }
+ }
+ }
+}
+
/* For each possible SLP instance decide whether to SLP it and calculate overall
unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
@@ -3296,6 +3224,9 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal)
return false;
}
+ /* Optimize permutations. */
+ vect_optimize_slp (bb_vinfo);
+
vect_record_base_alignments (bb_vinfo);
/* Analyze and verify the alignment of data references and the
@@ -8681,7 +8681,44 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
}
if (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
- slp_perm = true;
+ {
+ slp_perm = true;
+
+ if (!loop_vinfo)
+ {
+ /* In BB vectorization we may not actually use a loaded vector
+ accessing elements in excess of DR_GROUP_SIZE. */
+ stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (slp_node)[0];
+ group_info = DR_GROUP_FIRST_ELEMENT (group_info);
+ unsigned HOST_WIDE_INT nunits;
+ unsigned j, k, maxk = 0;
+ FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (slp_node), j, k)
+ if (k > maxk)
+ maxk = k;
+ tree vectype = STMT_VINFO_VECTYPE (group_info);
+ if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
+ || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "BB vectorization with gaps at the end of "
+ "a load is not supported\n");
+ return false;
+ }
+ }
+
+ auto_vec<tree> tem;
+ unsigned n_perms;
+ if (!vect_transform_slp_perm_load (slp_node, tem, NULL, vf,
+ true, &n_perms))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION,
+ vect_location,
+ "unsupported load permutation\n");
+ return false;
+ }
+ }
/* Invalidate assumptions made by dependence analysis when vectorization
on the unrolled body effectively re-orders stmts. */
@@ -9765,12 +9802,9 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
if (slp_perm)
{
unsigned n_perms;
- if (!vect_transform_slp_perm_load (slp_node, dr_chain, gsi, vf,
- false, &n_perms))
- {
- dr_chain.release ();
- return false;
- }
+ bool ok = vect_transform_slp_perm_load (slp_node, dr_chain, gsi, vf,
+ false, &n_perms);
+ gcc_assert (ok);
}
else
{
@@ -324,8 +324,9 @@ public:
/* The mapping of GIMPLE UID to stmt_vec_info. */
vec<stmt_vec_info> stmt_vec_infos;
- /* All SLP instances. */
+ /* The SLP graph. */
auto_vec<slp_instance> slp_instances;
+ auto_vec<slp_tree> slp_loads;
/* Maps base addresses to an innermost_loop_behavior that gives the maximum
known alignment for that base. */
@@ -1854,6 +1855,7 @@ extern void vect_schedule_slp (vec_info *);
extern opt_result vect_analyze_slp (vec_info *, unsigned);
extern bool vect_make_slp_decision (loop_vec_info);
extern void vect_detect_hybrid_slp (loop_vec_info);
+extern void vect_optimize_slp (vec_info *);
extern void vect_get_slp_defs (slp_tree, vec<vec<tree> > *, unsigned n = -1U);
extern bool vect_slp_bb (basic_block);
extern stmt_vec_info vect_find_last_scalar_stmt_in_slp (slp_tree);