Message ID | 20131015123225.GO30970@tucnak.zalov.cz |
---|---|
State | New |
Headers | show |
Jakub, Richard, I believe this patch is a good opportunity to improve the vectorization capabilities. I have the following question related to it: whether we plan to treat the #pragma omp simd as a directive to vectorize the underlying loop, hence dropping any assessment regarding profitablity? Regards, Sergos On Tue, Oct 15, 2013 at 4:32 PM, Jakub Jelinek <jakub@redhat.com> wrote: > Hi! > > Especially on i?86/x86_64 if-conversion pass seems to be often > a pessimization, but the vectorization relies on it and without it we can't > vectorize a lot of the loops. > > Here is a prototype of a patch that will by default (unless explicit > -ftree-loop-if-convert) only if-convert loops internally for vectorization, > so the COND_EXPRs actually only appear as VEC_COND_EXPRs in the vectorized > basic blocks, but will not appear if vectorization fails, or in the > scalar loop if vectorization is conditional, or in the prologue or epilogue > loops around the vectorized loop. > > Instead of moving the ifcvt pass inside of the vectorizer, this patch > during ifcvt performs loop versioning depending on a special internal > call, only if the internal call returns true we go to the if-converted > original loop, otherwise the non-if-converted copy of the original loop > is performed. And the vectorizer is taught to fold this internal call > into true resp. false depending on if the loop was vectorized or not, and > vectorizer loop versioning, peeling for alignment and for bound are adjusted > to also copy from the non-if-converted loop rather than if-converted one. > > Besides fixing the various PRs where if-conversion pessimizes code I'd like > to also move forward with this with conditional loads and stores, > http://gcc.gnu.org/ml/gcc-patches/2012-11/msg00202.html > where the if-unconversion approach looked like a failure. > > This patch doesn't yet handle if-converted inner loop in outer loop > vectorization, something on my todo list (so several vect-cond-*.c tests > FAIL because they are no longer vectorized) plus I had to change two > SLP vectorization tests that silently relied on loop if-conversion being > performed to actually optimize the basic block (if the same thing didn't > appear in a loop, it wouldn't be optimized at all). > > On the newly added testcase on x86_64, there are before this patch > 18 scalar conditional moves, with the patch just 2 (both in the checking > routine). > > Comments? > > --- gcc/internal-fn.def.jj 2013-10-11 14:32:57.079909782 +0200 > +++ gcc/internal-fn.def 2013-10-11 17:23:58.705526840 +0200 > @@ -43,3 +43,4 @@ DEF_INTERNAL_FN (STORE_LANES, ECF_CONST > DEF_INTERNAL_FN (GOMP_SIMD_LANE, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW) > DEF_INTERNAL_FN (GOMP_SIMD_VF, ECF_CONST | ECF_LEAF | ECF_NOTHROW) > DEF_INTERNAL_FN (GOMP_SIMD_LAST_LANE, ECF_CONST | ECF_LEAF | ECF_NOTHROW) > +DEF_INTERNAL_FN (LOOP_VECTORIZED, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW) > --- gcc/tree-vect-loop-manip.c.jj 2013-09-30 22:13:47.000000000 +0200 > +++ gcc/tree-vect-loop-manip.c 2013-10-15 12:57:54.854970913 +0200 > @@ -374,24 +374,31 @@ LOOP-> loop1 > > static void > slpeel_update_phi_nodes_for_guard1 (edge guard_edge, struct loop *loop, > + struct loop *scalar_loop, > bool is_new_loop, basic_block *new_exit_bb) > { > - gimple orig_phi, new_phi; > + gimple orig_phi, new_phi, scalar_phi = NULL; > gimple update_phi, update_phi2; > tree guard_arg, loop_arg; > basic_block new_merge_bb = guard_edge->dest; > edge e = EDGE_SUCC (new_merge_bb, 0); > basic_block update_bb = e->dest; > basic_block orig_bb = loop->header; > - edge new_exit_e; > + edge new_exit_e, scalar_e = NULL; > tree current_new_name; > - gimple_stmt_iterator gsi_orig, gsi_update; > + gimple_stmt_iterator gsi_orig, gsi_update, gsi_scalar = gsi_none (); > > /* Create new bb between loop and new_merge_bb. */ > *new_exit_bb = split_edge (single_exit (loop)); > > new_exit_e = EDGE_SUCC (*new_exit_bb, 0); > > + if (scalar_loop != NULL && !is_new_loop) > + { > + gsi_scalar = gsi_start_phis (scalar_loop->header); > + scalar_e = EDGE_SUCC (scalar_loop->latch, 0); > + } > + > for (gsi_orig = gsi_start_phis (orig_bb), > gsi_update = gsi_start_phis (update_bb); > !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update); > @@ -401,6 +408,11 @@ slpeel_update_phi_nodes_for_guard1 (edge > tree new_res; > orig_phi = gsi_stmt (gsi_orig); > update_phi = gsi_stmt (gsi_update); > + if (scalar_e != NULL) > + { > + scalar_phi = gsi_stmt (gsi_scalar); > + gsi_next (&gsi_scalar); > + } > > /** 1. Handle new-merge-point phis **/ > > @@ -460,7 +472,13 @@ slpeel_update_phi_nodes_for_guard1 (edge > current_new_name = loop_arg; > else > { > - current_new_name = get_current_def (loop_arg); > + if (scalar_e) > + { > + current_new_name = PHI_ARG_DEF_FROM_EDGE (scalar_phi, scalar_e); > + current_new_name = get_current_def (current_new_name); > + } > + else > + current_new_name = get_current_def (loop_arg); > /* current_def is not available only if the variable does not > change inside the loop, in which case we also don't care > about recording a current_def for it because we won't be > @@ -503,6 +521,7 @@ LOOP-> loop2 > > static void > slpeel_update_phi_nodes_for_guard2 (edge guard_edge, struct loop *loop, > + struct loop *scalar_loop, > bool is_new_loop, basic_block *new_exit_bb) > { > gimple orig_phi, new_phi; > @@ -511,17 +530,23 @@ slpeel_update_phi_nodes_for_guard2 (edge > basic_block new_merge_bb = guard_edge->dest; > edge e = EDGE_SUCC (new_merge_bb, 0); > basic_block update_bb = e->dest; > - edge new_exit_e; > + edge new_exit_e, scalar_e = NULL; > tree orig_def, orig_def_new_name; > tree new_name, new_name2; > tree arg; > - gimple_stmt_iterator gsi; > + gimple_stmt_iterator gsi, gsi_scalar = gsi_none (); > > /* Create new bb between loop and new_merge_bb. */ > *new_exit_bb = split_edge (single_exit (loop)); > > new_exit_e = EDGE_SUCC (*new_exit_bb, 0); > > + if (scalar_loop != NULL) > + { > + scalar_e = single_exit (scalar_loop); > + gsi_scalar = gsi_start_phis (scalar_e->dest); > + } > + > for (gsi = gsi_start_phis (update_bb); !gsi_end_p (gsi); gsi_next (&gsi)) > { > tree new_res; > @@ -532,7 +557,16 @@ slpeel_update_phi_nodes_for_guard2 (edge > out of the loop - the phi arg is a constant. */ > if (TREE_CODE (orig_def) != SSA_NAME) > continue; > - orig_def_new_name = get_current_def (orig_def); > + if (scalar_loop != NULL) > + { > + orig_def_new_name > + = PHI_ARG_DEF_FROM_EDGE (gsi_stmt (gsi_scalar), scalar_e); > + gcc_assert (TREE_CODE (orig_def_new_name) == SSA_NAME); > + orig_def_new_name = get_current_def (orig_def_new_name); > + gsi_next (&gsi_scalar); > + } > + else > + orig_def_new_name = get_current_def (orig_def); > arg = NULL_TREE; > > /** 1. Handle new-merge-point phis **/ > @@ -693,7 +727,8 @@ slpeel_make_loop_iterate_ntimes (struct > on E which is either the entry or exit of LOOP. */ > > struct loop * > -slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop, edge e) > +slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop, > + struct loop *scalar_loop, edge e) > { > struct loop *new_loop; > basic_block *new_bbs, *bbs; > @@ -707,19 +742,22 @@ slpeel_tree_duplicate_loop_to_edge_cfg ( > if (!at_exit && e != loop_preheader_edge (loop)) > return NULL; > > - bbs = XNEWVEC (basic_block, loop->num_nodes + 1); > - get_loop_body_with_size (loop, bbs, loop->num_nodes); > + if (scalar_loop == NULL) > + scalar_loop = loop; > + > + bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1); > + get_loop_body_with_size (scalar_loop, bbs, scalar_loop->num_nodes); > > /* Check whether duplication is possible. */ > - if (!can_copy_bbs_p (bbs, loop->num_nodes)) > + if (!can_copy_bbs_p (bbs, scalar_loop->num_nodes)) > { > free (bbs); > return NULL; > } > > /* Generate new loop structure. */ > - new_loop = duplicate_loop (loop, loop_outer (loop)); > - duplicate_subloops (loop, new_loop); > + new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop)); > + duplicate_subloops (scalar_loop, new_loop); > > exit_dest = exit->dest; > was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS, > @@ -729,35 +767,66 @@ slpeel_tree_duplicate_loop_to_edge_cfg ( > /* Also copy the pre-header, this avoids jumping through hoops to > duplicate the loop entry PHI arguments. Create an empty > pre-header unconditionally for this. */ > - basic_block preheader = split_edge (loop_preheader_edge (loop)); > + basic_block preheader = split_edge (loop_preheader_edge (scalar_loop)); > edge entry_e = single_pred_edge (preheader); > - bbs[loop->num_nodes] = preheader; > - new_bbs = XNEWVEC (basic_block, loop->num_nodes + 1); > + bbs[scalar_loop->num_nodes] = preheader; > + new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1); > > - copy_bbs (bbs, loop->num_nodes + 1, new_bbs, > + exit = single_exit (scalar_loop); > + copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs, > &exit, 1, &new_exit, NULL, > e->src, true); > - basic_block new_preheader = new_bbs[loop->num_nodes]; > + exit = single_exit (loop); > + basic_block new_preheader = new_bbs[scalar_loop->num_nodes]; > > - add_phi_args_after_copy (new_bbs, loop->num_nodes + 1, NULL); > + add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL); > > if (at_exit) /* Add the loop copy at exit. */ > { > + if (scalar_loop != loop) > + { > + gimple_stmt_iterator gsi; > + new_exit = redirect_edge_and_branch (new_exit, exit_dest); > + > + for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi); > + gsi_next (&gsi)) > + { > + gimple phi = gsi_stmt (gsi); > + tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e); > + location_t orig_locus > + = gimple_phi_arg_location_from_edge (phi, e); > + > + add_phi_arg (phi, orig_arg, new_exit, orig_locus); > + } > + } > redirect_edge_and_branch_force (e, new_preheader); > flush_pending_stmts (e); > set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src); > if (was_imm_dom) > - set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_loop->header); > + set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src); > > /* And remove the non-necessary forwarder again. Keep the other > one so we have a proper pre-header for the loop at the exit edge. */ > - redirect_edge_pred (single_succ_edge (preheader), single_pred (preheader)); > + redirect_edge_pred (single_succ_edge (preheader), > + single_pred (preheader)); > delete_basic_block (preheader); > - set_immediate_dominator (CDI_DOMINATORS, loop->header, > - loop_preheader_edge (loop)->src); > + set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header, > + loop_preheader_edge (scalar_loop)->src); > } > else /* Add the copy at entry. */ > { > + if (scalar_loop != loop) > + { > + /* Remove the non-necessary forwarder of scalar_loop again. */ > + redirect_edge_pred (single_succ_edge (preheader), > + single_pred (preheader)); > + delete_basic_block (preheader); > + set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header, > + loop_preheader_edge (scalar_loop)->src); > + preheader = split_edge (loop_preheader_edge (loop)); > + entry_e = single_pred_edge (preheader); > + } > + > redirect_edge_and_branch_force (entry_e, new_preheader); > flush_pending_stmts (entry_e); > set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src); > @@ -768,15 +837,39 @@ slpeel_tree_duplicate_loop_to_edge_cfg ( > > /* And remove the non-necessary forwarder again. Keep the other > one so we have a proper pre-header for the loop at the exit edge. */ > - redirect_edge_pred (single_succ_edge (new_preheader), single_pred (new_preheader)); > + redirect_edge_pred (single_succ_edge (new_preheader), > + single_pred (new_preheader)); > delete_basic_block (new_preheader); > set_immediate_dominator (CDI_DOMINATORS, new_loop->header, > loop_preheader_edge (new_loop)->src); > } > > - for (unsigned i = 0; i < loop->num_nodes+1; i++) > + for (unsigned i = 0; i < scalar_loop->num_nodes + 1; i++) > rename_variables_in_bb (new_bbs[i]); > > + if (scalar_loop != loop) > + { > + /* Update new_loop->header PHIs, so that on the preheader > + edge they are the ones from loop rather than scalar_loop. */ > + gimple_stmt_iterator gsi_orig, gsi_new; > + edge orig_e = loop_preheader_edge (loop); > + edge new_e = loop_preheader_edge (new_loop); > + > + for (gsi_orig = gsi_start_phis (loop->header), > + gsi_new = gsi_start_phis (new_loop->header); > + !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new); > + gsi_next (&gsi_orig), gsi_next (&gsi_new)) > + { > + gimple orig_phi = gsi_stmt (gsi_orig); > + gimple new_phi = gsi_stmt (gsi_new); > + tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e); > + location_t orig_locus > + = gimple_phi_arg_location_from_edge (orig_phi, orig_e); > + > + add_phi_arg (new_phi, orig_arg, new_e, orig_locus); > + } > + } > + > free (new_bbs); > free (bbs); > > @@ -1028,8 +1121,8 @@ set_prologue_iterations (basic_block bb_ > FORNOW the resulting code will not be in loop-closed-ssa form. > */ > > -static struct loop* > -slpeel_tree_peel_loop_to_edge (struct loop *loop, > +static struct loop * > +slpeel_tree_peel_loop_to_edge (struct loop *loop, struct loop *scalar_loop, > edge e, tree *first_niters, > tree niters, bool update_first_loop_count, > unsigned int th, bool check_profitability, > @@ -1114,7 +1207,8 @@ slpeel_tree_peel_loop_to_edge (struct lo > orig_exit_bb: > */ > > - if (!(new_loop = slpeel_tree_duplicate_loop_to_edge_cfg (loop, e))) > + if (!(new_loop = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, > + e))) > { > loop_loc = find_loop_location (loop); > dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc, > @@ -1291,7 +1385,7 @@ slpeel_tree_peel_loop_to_edge (struct lo > inverse_probability (first_guard_probability)); > scale_loop_profile (first_loop, first_guard_probability, > check_profitability && (int)th > bound1 ? th : bound1); > - slpeel_update_phi_nodes_for_guard1 (skip_e, first_loop, > + slpeel_update_phi_nodes_for_guard1 (skip_e, first_loop, scalar_loop, > first_loop == new_loop, > &new_exit_bb); > > @@ -1331,7 +1425,7 @@ slpeel_tree_peel_loop_to_edge (struct lo > bb_after_second_loop, bb_before_first_loop, > inverse_probability (second_guard_probability)); > scale_loop_profile (second_loop, probability_of_second_loop, bound2); > - slpeel_update_phi_nodes_for_guard2 (skip_e, second_loop, > + slpeel_update_phi_nodes_for_guard2 (skip_e, second_loop, scalar_loop, > second_loop == new_loop, &new_exit_bb); > > /* 4. Make first-loop iterate FIRST_NITERS times, if requested. > @@ -1755,6 +1849,7 @@ vect_do_peeling_for_loop_bound (loop_vec > { > tree ni_name, ratio_mult_vf_name; > struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); > + struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo); > struct loop *new_loop; > edge update_e; > basic_block preheader; > @@ -1780,11 +1875,12 @@ vect_do_peeling_for_loop_bound (loop_vec > > loop_num = loop->num; > > - new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop), > - &ratio_mult_vf_name, ni_name, false, > - th, check_profitability, > - cond_expr, cond_expr_stmt_list, > - 0, LOOP_VINFO_VECT_FACTOR (loop_vinfo)); > + new_loop > + = slpeel_tree_peel_loop_to_edge (loop, scalar_loop, single_exit (loop), > + &ratio_mult_vf_name, ni_name, false, > + th, check_profitability, > + cond_expr, cond_expr_stmt_list, > + 0, LOOP_VINFO_VECT_FACTOR (loop_vinfo)); > gcc_assert (new_loop); > gcc_assert (loop_num == loop->num); > #ifdef ENABLE_CHECKING > @@ -2017,6 +2113,7 @@ vect_do_peeling_for_alignment (loop_vec_ > unsigned int th, bool check_profitability) > { > struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); > + struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo); > tree niters_of_prolog_loop, ni_name; > tree n_iters; > tree wide_prolog_niters; > @@ -2038,11 +2135,11 @@ vect_do_peeling_for_alignment (loop_vec_ > > /* Peel the prolog loop and iterate it niters_of_prolog_loop. */ > new_loop = > - slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop), > + slpeel_tree_peel_loop_to_edge (loop, scalar_loop, > + loop_preheader_edge (loop), > &niters_of_prolog_loop, ni_name, true, > th, check_profitability, NULL_TREE, NULL, > - bound, > - 0); > + bound, 0); > > gcc_assert (new_loop); > #ifdef ENABLE_CHECKING > @@ -2398,6 +2495,7 @@ vect_loop_versioning (loop_vec_info loop > unsigned int th, bool check_profitability) > { > struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); > + struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo); > basic_block condition_bb; > gimple_stmt_iterator gsi, cond_exp_gsi; > basic_block merge_bb; > @@ -2433,8 +2531,45 @@ vect_loop_versioning (loop_vec_info loop > gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list); > > initialize_original_copy_tables (); > - loop_version (loop, cond_expr, &condition_bb, > - prob, prob, REG_BR_PROB_BASE - prob, true); > + if (scalar_loop) > + { > + edge scalar_e; > + basic_block preheader, scalar_preheader; > + > + /* We don't want to scale SCALAR_LOOP's frequencies, we need to > + scale LOOP's frequencies instead. */ > + loop_version (scalar_loop, cond_expr, &condition_bb, > + prob, REG_BR_PROB_BASE, REG_BR_PROB_BASE - prob, true); > + scale_loop_frequencies (loop, prob, REG_BR_PROB_BASE); > + /* CONDITION_BB was created above SCALAR_LOOP's preheader, > + while we need to move it above LOOP's preheader. */ > + e = loop_preheader_edge (loop); > + scalar_e = loop_preheader_edge (scalar_loop); > + gcc_assert (gimple_seq_empty_p (bb_seq (e->src)) > + && gimple_seq_empty_p (phi_nodes (e->src)) > + && single_pred_p (e->src)); > + gcc_assert (gimple_seq_empty_p (bb_seq (scalar_e->src)) > + && gimple_seq_empty_p (phi_nodes (scalar_e->src)) > + && single_pred_p (scalar_e->src)); > + gcc_assert (single_pred_p (condition_bb)); > + preheader = e->src; > + scalar_preheader = scalar_e->src; > + scalar_e = find_edge (condition_bb, scalar_preheader); > + e = single_pred_edge (preheader); > + redirect_edge_and_branch_force (single_pred_edge (condition_bb), > + scalar_preheader); > + redirect_edge_and_branch_force (scalar_e, preheader); > + redirect_edge_and_branch_force (e, condition_bb); > + set_immediate_dominator (CDI_DOMINATORS, condition_bb, > + single_pred (condition_bb)); > + set_immediate_dominator (CDI_DOMINATORS, scalar_preheader, > + single_pred (scalar_preheader)); > + set_immediate_dominator (CDI_DOMINATORS, preheader, > + condition_bb); > + } > + else > + loop_version (loop, cond_expr, &condition_bb, > + prob, prob, REG_BR_PROB_BASE - prob, true); > > if (LOCATION_LOCUS (vect_location) != UNKNOWN_LOC > && dump_enabled_p ()) > @@ -2457,24 +2592,29 @@ vect_loop_versioning (loop_vec_info loop > basic block (i.e. it has two predecessors). Just in order to simplify > following transformations in the vectorizer, we fix this situation > here by adding a new (empty) block on the exit-edge of the loop, > - with the proper loop-exit phis to maintain loop-closed-form. */ > + with the proper loop-exit phis to maintain loop-closed-form. > + If loop versioning wasn't done from loop, but scalar_loop instead, > + merge_bb will have already just a single successor. */ > > merge_bb = single_exit (loop)->dest; > - gcc_assert (EDGE_COUNT (merge_bb->preds) == 2); > - new_exit_bb = split_edge (single_exit (loop)); > - new_exit_e = single_exit (loop); > - e = EDGE_SUCC (new_exit_bb, 0); > - > - for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi); gsi_next (&gsi)) > + if (scalar_loop == NULL || EDGE_COUNT (merge_bb->preds) >= 2) > { > - tree new_res; > - orig_phi = gsi_stmt (gsi); > - new_res = copy_ssa_name (PHI_RESULT (orig_phi), NULL); > - new_phi = create_phi_node (new_res, new_exit_bb); > - arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e); > - add_phi_arg (new_phi, arg, new_exit_e, > - gimple_phi_arg_location_from_edge (orig_phi, e)); > - adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi)); > + gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2); > + new_exit_bb = split_edge (single_exit (loop)); > + new_exit_e = single_exit (loop); > + e = EDGE_SUCC (new_exit_bb, 0); > + > + for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi); gsi_next (&gsi)) > + { > + tree new_res; > + orig_phi = gsi_stmt (gsi); > + new_res = copy_ssa_name (PHI_RESULT (orig_phi), NULL); > + new_phi = create_phi_node (new_res, new_exit_bb); > + arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e); > + add_phi_arg (new_phi, arg, new_exit_e, > + gimple_phi_arg_location_from_edge (orig_phi, e)); > + adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi)); > + } > } > > /* End loop-exit-fixes after versioning. */ > --- gcc/tree-vectorizer.c.jj 2013-10-11 14:32:57.082909767 +0200 > +++ gcc/tree-vectorizer.c 2013-10-14 15:34:19.921860478 +0200 > @@ -306,6 +306,43 @@ vect_destroy_datarefs (loop_vec_info loo > } > > > +/* If LOOP has been versioned during ifcvt, return the internal call > + guarding it. */ > + > +static gimple > +vect_loop_vectorized_call (struct loop *loop) > +{ > + basic_block bb = loop_preheader_edge (loop)->src; > + gimple g; > + do > + { > + g = last_stmt (bb); > + if (g) > + break; > + if (!single_pred_p (bb)) > + break; > + bb = single_pred (bb); > + } > + while (1); > + if (g && gimple_code (g) == GIMPLE_COND) > + { > + gimple_stmt_iterator gsi = gsi_for_stmt (g); > + gsi_prev (&gsi); > + if (!gsi_end_p (gsi)) > + { > + g = gsi_stmt (gsi); > + if (is_gimple_call (g) > + && gimple_call_internal_p (g) > + && gimple_call_internal_fn (g) == IFN_LOOP_VECTORIZED > + && (tree_low_cst (gimple_call_arg (g, 0), 0) == loop->num > + || tree_low_cst (gimple_call_arg (g, 1), 0) == loop->num)) > + return g; > + } > + } > + return NULL; > +} > + > + > /* Function vectorize_loops. > > Entry point to loop vectorization phase. */ > @@ -320,6 +357,8 @@ vectorize_loops (void) > struct loop *loop; > hash_table <simduid_to_vf> simduid_to_vf_htab; > hash_table <simd_array_to_simduid> simd_array_to_simduid_htab; > + bool any_ifcvt_loops = false; > + unsigned ret = 0; > > vect_loops_num = number_of_loops (cfun); > > @@ -342,8 +381,11 @@ vectorize_loops (void) > than all previously defined loops. This fact allows us to run > only over initial loops skipping newly generated ones. */ > FOR_EACH_LOOP (li, loop, 0) > - if ((flag_tree_loop_vectorize && optimize_loop_nest_for_speed_p (loop)) > - || loop->force_vect) > + if (loop->dont_vectorize) > + any_ifcvt_loops = true; > + else if ((flag_tree_loop_vectorize > + && optimize_loop_nest_for_speed_p (loop)) > + || loop->force_vect) > { > loop_vec_info loop_vinfo; > vect_location = find_loop_location (loop); > @@ -361,6 +403,38 @@ vectorize_loops (void) > if (!dbg_cnt (vect_loop)) > break; > > + gimple loop_vectorized_call = vect_loop_vectorized_call (loop); > + if (loop_vectorized_call) > + { > + tree arg = gimple_call_arg (loop_vectorized_call, 1); > + basic_block *bbs; > + unsigned int i; > + struct loop *scalar_loop = get_loop (cfun, tree_low_cst (arg, 0)); > + > + LOOP_VINFO_SCALAR_LOOP (loop_vinfo) = scalar_loop; > + gcc_checking_assert (vect_loop_vectorized_call > + (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)) > + == loop_vectorized_call); > + bbs = get_loop_body (scalar_loop); > + for (i = 0; i < scalar_loop->num_nodes; i++) > + { > + basic_block bb = bbs[i]; > + gimple_stmt_iterator gsi; > + for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); > + gsi_next (&gsi)) > + { > + gimple phi = gsi_stmt (gsi); > + gimple_set_uid (phi, 0); > + } > + for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); > + gsi_next (&gsi)) > + { > + gimple stmt = gsi_stmt (gsi); > + gimple_set_uid (stmt, 0); > + } > + } > + free (bbs); > + } > if (LOCATION_LOCUS (vect_location) != UNKNOWN_LOC > && dump_enabled_p ()) > dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location, > @@ -381,6 +455,25 @@ vectorize_loops (void) > *simduid_to_vf_htab.find_slot (simduid_to_vf_data, INSERT) > = simduid_to_vf_data; > } > + > + if (loop_vectorized_call) > + { > + gimple g = loop_vectorized_call; > + tree lhs = gimple_call_lhs (g); > + gimple_stmt_iterator gsi = gsi_for_stmt (g); > + gimplify_and_update_call_from_tree (&gsi, boolean_true_node); > + gsi_next (&gsi); > + if (!gsi_end_p (gsi)) > + { > + g = gsi_stmt (gsi); > + if (gimple_code (g) == GIMPLE_COND > + && gimple_cond_lhs (g) == lhs) > + { > + gimple_cond_set_lhs (g, boolean_true_node); > + update_stmt (g); > + } > + } > + } > } > > vect_location = UNKNOWN_LOC; > @@ -394,6 +487,34 @@ vectorize_loops (void) > > /* ----------- Finalize. ----------- */ > > + if (any_ifcvt_loops) > + for (i = 1; i < vect_loops_num; i++) > + { > + loop = get_loop (cfun, i); > + if (loop && loop->dont_vectorize) > + { > + gimple g = vect_loop_vectorized_call (loop); > + if (g) > + { > + tree lhs = gimple_call_lhs (g); > + gimple_stmt_iterator gsi = gsi_for_stmt (g); > + gimplify_and_update_call_from_tree (&gsi, boolean_false_node); > + gsi_next (&gsi); > + if (!gsi_end_p (gsi)) > + { > + g = gsi_stmt (gsi); > + if (gimple_code (g) == GIMPLE_COND > + && gimple_cond_lhs (g) == lhs) > + { > + gimple_cond_set_lhs (g, boolean_false_node); > + update_stmt (g); > + } > + } > + ret = TODO_cleanup_cfg; > + } > + } > + } > + > for (i = 1; i < vect_loops_num; i++) > { > loop_vec_info loop_vinfo; > @@ -451,7 +572,7 @@ vectorize_loops (void) > return TODO_cleanup_cfg; > } > > - return 0; > + return ret; > } > > > --- gcc/tree-vectorizer.h.jj 2013-10-11 14:32:57.086909746 +0200 > +++ gcc/tree-vectorizer.h 2013-10-14 14:32:55.538688209 +0200 > @@ -314,6 +314,10 @@ typedef struct _loop_vec_info { > fix it up. */ > bool operands_swapped; > > + /* If if-conversion versioned this loop before conversion, this is the > + loop version without if-conversion. */ > + struct loop *scalar_loop; > + > } *loop_vec_info; > > /* Access Functions. */ > @@ -345,6 +349,7 @@ typedef struct _loop_vec_info { > #define LOOP_VINFO_TARGET_COST_DATA(L) (L)->target_cost_data > #define LOOP_VINFO_PEELING_FOR_GAPS(L) (L)->peeling_for_gaps > #define LOOP_VINFO_OPERANDS_SWAPPED(L) (L)->operands_swapped > +#define LOOP_VINFO_SCALAR_LOOP(L) (L)->scalar_loop > > #define LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT(L) \ > (L)->may_misalign_stmts.length () > 0 > @@ -899,7 +904,8 @@ extern LOC vect_location; > in tree-vect-loop-manip.c. */ > extern void slpeel_make_loop_iterate_ntimes (struct loop *, tree); > extern bool slpeel_can_duplicate_loop_p (const struct loop *, const_edge); > -struct loop *slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *, edge); > +struct loop *slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *, > + struct loop *, edge); > extern void vect_loop_versioning (loop_vec_info, unsigned int, bool); > extern void vect_do_peeling_for_loop_bound (loop_vec_info, tree *, > unsigned int, bool); > --- gcc/cfgloop.h.jj 2013-10-11 14:32:57.089909730 +0200 > +++ gcc/cfgloop.h 2013-10-11 17:23:58.706526905 +0200 > @@ -177,6 +177,9 @@ struct GTY ((chain_next ("%h.next"))) lo > /* True if we should try harder to vectorize this loop. */ > bool force_vect; > > + /* True if this loop should never be vectorized. */ > + bool dont_vectorize; > + > /* For SIMD loops, this is a unique identifier of the loop, referenced > by IFN_GOMP_SIMD_VF, IFN_GOMP_SIMD_LANE and IFN_GOMP_SIMD_LAST_LANE > builtins. */ > --- gcc/tree-loop-distribution.c.jj 2013-10-07 15:06:40.000000000 +0200 > +++ gcc/tree-loop-distribution.c 2013-10-14 14:33:22.448549212 +0200 > @@ -673,7 +673,7 @@ copy_loop_before (struct loop *loop) > edge preheader = loop_preheader_edge (loop); > > initialize_original_copy_tables (); > - res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, preheader); > + res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, NULL, preheader); > gcc_assert (res != NULL); > free_original_copy_tables (); > delete_update_ssa (); > --- gcc/internal-fn.c.jj 2013-10-11 14:32:57.092909715 +0200 > +++ gcc/internal-fn.c 2013-10-11 17:23:58.706526905 +0200 > @@ -133,6 +133,14 @@ expand_GOMP_SIMD_LAST_LANE (gimple stmt > gcc_unreachable (); > } > > +/* This should get folded in tree-vectorizer.c. */ > + > +static void > +expand_LOOP_VECTORIZED (gimple stmt ATTRIBUTE_UNUSED) > +{ > + gcc_unreachable (); > +} > + > /* Routines to expand each internal function, indexed by function number. > Each routine has the prototype: > > --- gcc/tree-if-conv.c.jj 2013-10-11 14:32:57.095909699 +0200 > +++ gcc/tree-if-conv.c 2013-10-11 17:23:58.707526969 +0200 > @@ -1735,6 +1735,48 @@ combine_blocks (struct loop *loop) > ifc_bbs = NULL; > } > > +static bool > +version_loop_for_if_conversion (struct loop *loop) > +{ > + basic_block cond_bb; > + tree cond = make_ssa_name (boolean_type_node, NULL); > + struct loop *new_loop; > + gimple g; > + gimple_stmt_iterator gsi; > + void **aux = XNEWVEC (void *, loop->num_nodes); > + unsigned int i; > + > + /* We have data stored in bb->aux, but loop_version also > + uses it, so save it temporarily and restore after loop_version. */ > + for (i = 0; i < loop->num_nodes; i++) > + { > + aux[i] = ifc_bbs[i]->aux; > + ifc_bbs[i]->aux = NULL; > + } > + g = gimple_build_call_internal (IFN_LOOP_VECTORIZED, 2, > + build_int_cst (integer_type_node, loop->num), > + integer_zero_node); > + gimple_call_set_lhs (g, cond); > + > + initialize_original_copy_tables (); > + new_loop = loop_version (loop, cond, &cond_bb, > + REG_BR_PROB_BASE, REG_BR_PROB_BASE, > + REG_BR_PROB_BASE, true); > + free_original_copy_tables (); > + for (i = 0; i < loop->num_nodes; i++) > + ifc_bbs[i]->aux = aux[i]; > + XDELETEVEC (aux); > + if (new_loop == NULL) > + return false; > + new_loop->dont_vectorize = true; > + new_loop->force_vect = false; > + gsi = gsi_last_bb (cond_bb); > + gimple_call_set_arg (g, 1, build_int_cst (integer_type_node, new_loop->num)); > + gsi_insert_before (&gsi, g, GSI_SAME_STMT); > + update_ssa (TODO_update_ssa); > + return true; > +} > + > /* If-convert LOOP when it is legal. For the moment this pass has no > profitability analysis. Returns true when something changed. */ > > @@ -1744,10 +1786,18 @@ tree_if_conversion (struct loop *loop) > bool changed = false; > ifc_bbs = NULL; > > + if (loop->dont_vectorize) > + goto cleanup; > + > if (!if_convertible_loop_p (loop) > || !dbg_cnt (if_conversion_tree)) > goto cleanup; > > + if ((flag_tree_loop_vectorize || loop->force_vect) > + && flag_tree_loop_if_convert == -1 > + && !version_loop_for_if_conversion (loop)) > + goto cleanup; > + > /* Now all statements are if-convertible. Combine all the basic > blocks into one huge basic block doing the if-conversion > on-the-fly. */ > --- gcc/testsuite/gcc.dg/vect/vect-cond-11.c.jj 2013-10-15 14:01:07.877814190 +0200 > +++ gcc/testsuite/gcc.dg/vect/vect-cond-11.c 2013-10-15 14:02:29.302414970 +0200 > @@ -0,0 +1,116 @@ > +#include "tree-vect.h" > + > +#define N 1024 > +typedef int V __attribute__((vector_size (4))); > +unsigned int a[N * 2] __attribute__((aligned)); > +unsigned int b[N * 2] __attribute__((aligned)); > +V c[N]; > + > +__attribute__((noinline, noclone)) unsigned int > +foo (unsigned int *a, unsigned int *b) > +{ > + int i; > + unsigned int r = 0; > + for (i = 0; i < N; i++) > + { > + unsigned int x = a[i], y = b[i]; > + if (x < 32) > + { > + x = x + 127; > + y = y * 2; > + } > + else > + { > + x = x - 16; > + y = y + 1; > + } > + a[i] = x; > + b[i] = y; > + r += x; > + } > + return r; > +} > + > +__attribute__((noinline, noclone)) unsigned int > +bar (unsigned int *a, unsigned int *b) > +{ > + int i; > + unsigned int r = 0; > + for (i = 0; i < N; i++) > + { > + unsigned int x = a[i], y = b[i]; > + if (x < 32) > + { > + x = x + 127; > + y = y * 2; > + } > + else > + { > + x = x - 16; > + y = y + 1; > + } > + a[i] = x; > + b[i] = y; > + c[i] = c[i] + 1; > + r += x; > + } > + return r; > +} > + > +void > +baz (unsigned int *a, unsigned int *b, > + unsigned int (*fn) (unsigned int *, unsigned int *)) > +{ > + int i; > + for (i = -64; i < 0; i++) > + { > + a[i] = 19; > + b[i] = 17; > + } > + for (; i < N; i++) > + { > + a[i] = i - 512; > + b[i] = i; > + } > + for (; i < N + 64; i++) > + { > + a[i] = 27; > + b[i] = 19; > + } > + if (fn (a, b) != -512U - (N - 32) * 16U + 32 * 127U) > + __builtin_abort (); > + for (i = -64; i < 0; i++) > + if (a[i] != 19 || b[i] != 17) > + __builtin_abort (); > + for (; i < N; i++) > + if (a[i] != (i - 512U < 32U ? i - 512U + 127 : i - 512U - 16) > + || b[i] != (i - 512U < 32U ? i * 2U : i + 1U)) > + __builtin_abort (); > + for (; i < N + 64; i++) > + if (a[i] != 27 || b[i] != 19) > + __builtin_abort (); > +} > + > +int > +main () > +{ > + int i; > + check_vect (); > + baz (a + 512, b + 512, foo); > + baz (a + 512, b + 512, bar); > + baz (a + 512 + 1, b + 512 + 1, foo); > + baz (a + 512 + 1, b + 512 + 1, bar); > + baz (a + 512 + 31, b + 512 + 31, foo); > + baz (a + 512 + 31, b + 512 + 31, bar); > + baz (a + 512 + 1, b + 512, foo); > + baz (a + 512 + 1, b + 512, bar); > + baz (a + 512 + 31, b + 512, foo); > + baz (a + 512 + 31, b + 512, bar); > + baz (a + 512, b + 512 + 1, foo); > + baz (a + 512, b + 512 + 1, bar); > + baz (a + 512, b + 512 + 31, foo); > + baz (a + 512, b + 512 + 31, bar); > + return 0; > +} > + > +/* { dg-final { cleanup-tree-dump "vect" } } */ > --- gcc/testsuite/gcc.dg/vect/bb-slp-cond-1.c.jj 2013-08-30 14:38:40.000000000 +0200 > +++ gcc/testsuite/gcc.dg/vect/bb-slp-cond-1.c 2013-10-14 13:19:21.704256653 +0200 > @@ -1,4 +1,5 @@ > /* { dg-require-effective-target vect_condition } */ > +/* { dg-additional-options "-ftree-loop-if-convert" } */ > > #include "tree-vect.h" > > --- gcc/testsuite/gcc.dg/vect/bb-slp-pattern-2.c.jj 2013-08-30 14:38:40.000000000 +0200 > +++ gcc/testsuite/gcc.dg/vect/bb-slp-pattern-2.c 2013-10-14 13:19:35.678195952 +0200 > @@ -1,4 +1,5 @@ > /* { dg-require-effective-target vect_condition } */ > +/* { dg-additional-options "-ftree-loop-if-convert" } */ > > #include "tree-vect.h" > > > Jakub
> On Oct 15, 2013, at 5:32 AM, Jakub Jelinek <jakub@redhat.com> wrote: > > Hi! > > Especially on i?86/x86_64 if-conversion pass seems to be often > a pessimization, but the vectorization relies on it and without it we can't > vectorize a lot of the loops. I think on many other targets it actually helps. I know for one it helps on octeon even though octeon has no vector instructions. I think it helps most arm targets too. Thanks, Andrew > > Here is a prototype of a patch that will by default (unless explicit > -ftree-loop-if-convert) only if-convert loops internally for vectorization, > so the COND_EXPRs actually only appear as VEC_COND_EXPRs in the vectorized > basic blocks, but will not appear if vectorization fails, or in the > scalar loop if vectorization is conditional, or in the prologue or epilogue > loops around the vectorized loop. > > Instead of moving the ifcvt pass inside of the vectorizer, this patch > during ifcvt performs loop versioning depending on a special internal > call, only if the internal call returns true we go to the if-converted > original loop, otherwise the non-if-converted copy of the original loop > is performed. And the vectorizer is taught to fold this internal call > into true resp. false depending on if the loop was vectorized or not, and > vectorizer loop versioning, peeling for alignment and for bound are adjusted > to also copy from the non-if-converted loop rather than if-converted one. > > Besides fixing the various PRs where if-conversion pessimizes code I'd like > to also move forward with this with conditional loads and stores, > http://gcc.gnu.org/ml/gcc-patches/2012-11/msg00202.html > where the if-unconversion approach looked like a failure. > > This patch doesn't yet handle if-converted inner loop in outer loop > vectorization, something on my todo list (so several vect-cond-*.c tests > FAIL because they are no longer vectorized) plus I had to change two > SLP vectorization tests that silently relied on loop if-conversion being > performed to actually optimize the basic block (if the same thing didn't > appear in a loop, it wouldn't be optimized at all). > > On the newly added testcase on x86_64, there are before this patch > 18 scalar conditional moves, with the patch just 2 (both in the checking > routine). > > Comments? > > --- gcc/internal-fn.def.jj 2013-10-11 14:32:57.079909782 +0200 > +++ gcc/internal-fn.def 2013-10-11 17:23:58.705526840 +0200 > @@ -43,3 +43,4 @@ DEF_INTERNAL_FN (STORE_LANES, ECF_CONST > DEF_INTERNAL_FN (GOMP_SIMD_LANE, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW) > DEF_INTERNAL_FN (GOMP_SIMD_VF, ECF_CONST | ECF_LEAF | ECF_NOTHROW) > DEF_INTERNAL_FN (GOMP_SIMD_LAST_LANE, ECF_CONST | ECF_LEAF | ECF_NOTHROW) > +DEF_INTERNAL_FN (LOOP_VECTORIZED, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW) > --- gcc/tree-vect-loop-manip.c.jj 2013-09-30 22:13:47.000000000 +0200 > +++ gcc/tree-vect-loop-manip.c 2013-10-15 12:57:54.854970913 +0200 > @@ -374,24 +374,31 @@ LOOP-> loop1 > > static void > slpeel_update_phi_nodes_for_guard1 (edge guard_edge, struct loop *loop, > + struct loop *scalar_loop, > bool is_new_loop, basic_block *new_exit_bb) > { > - gimple orig_phi, new_phi; > + gimple orig_phi, new_phi, scalar_phi = NULL; > gimple update_phi, update_phi2; > tree guard_arg, loop_arg; > basic_block new_merge_bb = guard_edge->dest; > edge e = EDGE_SUCC (new_merge_bb, 0); > basic_block update_bb = e->dest; > basic_block orig_bb = loop->header; > - edge new_exit_e; > + edge new_exit_e, scalar_e = NULL; > tree current_new_name; > - gimple_stmt_iterator gsi_orig, gsi_update; > + gimple_stmt_iterator gsi_orig, gsi_update, gsi_scalar = gsi_none (); > > /* Create new bb between loop and new_merge_bb. */ > *new_exit_bb = split_edge (single_exit (loop)); > > new_exit_e = EDGE_SUCC (*new_exit_bb, 0); > > + if (scalar_loop != NULL && !is_new_loop) > + { > + gsi_scalar = gsi_start_phis (scalar_loop->header); > + scalar_e = EDGE_SUCC (scalar_loop->latch, 0); > + } > + > for (gsi_orig = gsi_start_phis (orig_bb), > gsi_update = gsi_start_phis (update_bb); > !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update); > @@ -401,6 +408,11 @@ slpeel_update_phi_nodes_for_guard1 (edge > tree new_res; > orig_phi = gsi_stmt (gsi_orig); > update_phi = gsi_stmt (gsi_update); > + if (scalar_e != NULL) > + { > + scalar_phi = gsi_stmt (gsi_scalar); > + gsi_next (&gsi_scalar); > + } > > /** 1. Handle new-merge-point phis **/ > > @@ -460,7 +472,13 @@ slpeel_update_phi_nodes_for_guard1 (edge > current_new_name = loop_arg; > else > { > - current_new_name = get_current_def (loop_arg); > + if (scalar_e) > + { > + current_new_name = PHI_ARG_DEF_FROM_EDGE (scalar_phi, scalar_e); > + current_new_name = get_current_def (current_new_name); > + } > + else > + current_new_name = get_current_def (loop_arg); > /* current_def is not available only if the variable does not > change inside the loop, in which case we also don't care > about recording a current_def for it because we won't be > @@ -503,6 +521,7 @@ LOOP-> loop2 > > static void > slpeel_update_phi_nodes_for_guard2 (edge guard_edge, struct loop *loop, > + struct loop *scalar_loop, > bool is_new_loop, basic_block *new_exit_bb) > { > gimple orig_phi, new_phi; > @@ -511,17 +530,23 @@ slpeel_update_phi_nodes_for_guard2 (edge > basic_block new_merge_bb = guard_edge->dest; > edge e = EDGE_SUCC (new_merge_bb, 0); > basic_block update_bb = e->dest; > - edge new_exit_e; > + edge new_exit_e, scalar_e = NULL; > tree orig_def, orig_def_new_name; > tree new_name, new_name2; > tree arg; > - gimple_stmt_iterator gsi; > + gimple_stmt_iterator gsi, gsi_scalar = gsi_none (); > > /* Create new bb between loop and new_merge_bb. */ > *new_exit_bb = split_edge (single_exit (loop)); > > new_exit_e = EDGE_SUCC (*new_exit_bb, 0); > > + if (scalar_loop != NULL) > + { > + scalar_e = single_exit (scalar_loop); > + gsi_scalar = gsi_start_phis (scalar_e->dest); > + } > + > for (gsi = gsi_start_phis (update_bb); !gsi_end_p (gsi); gsi_next (&gsi)) > { > tree new_res; > @@ -532,7 +557,16 @@ slpeel_update_phi_nodes_for_guard2 (edge > out of the loop - the phi arg is a constant. */ > if (TREE_CODE (orig_def) != SSA_NAME) > continue; > - orig_def_new_name = get_current_def (orig_def); > + if (scalar_loop != NULL) > + { > + orig_def_new_name > + = PHI_ARG_DEF_FROM_EDGE (gsi_stmt (gsi_scalar), scalar_e); > + gcc_assert (TREE_CODE (orig_def_new_name) == SSA_NAME); > + orig_def_new_name = get_current_def (orig_def_new_name); > + gsi_next (&gsi_scalar); > + } > + else > + orig_def_new_name = get_current_def (orig_def); > arg = NULL_TREE; > > /** 1. Handle new-merge-point phis **/ > @@ -693,7 +727,8 @@ slpeel_make_loop_iterate_ntimes (struct > on E which is either the entry or exit of LOOP. */ > > struct loop * > -slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop, edge e) > +slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop, > + struct loop *scalar_loop, edge e) > { > struct loop *new_loop; > basic_block *new_bbs, *bbs; > @@ -707,19 +742,22 @@ slpeel_tree_duplicate_loop_to_edge_cfg ( > if (!at_exit && e != loop_preheader_edge (loop)) > return NULL; > > - bbs = XNEWVEC (basic_block, loop->num_nodes + 1); > - get_loop_body_with_size (loop, bbs, loop->num_nodes); > + if (scalar_loop == NULL) > + scalar_loop = loop; > + > + bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1); > + get_loop_body_with_size (scalar_loop, bbs, scalar_loop->num_nodes); > > /* Check whether duplication is possible. */ > - if (!can_copy_bbs_p (bbs, loop->num_nodes)) > + if (!can_copy_bbs_p (bbs, scalar_loop->num_nodes)) > { > free (bbs); > return NULL; > } > > /* Generate new loop structure. */ > - new_loop = duplicate_loop (loop, loop_outer (loop)); > - duplicate_subloops (loop, new_loop); > + new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop)); > + duplicate_subloops (scalar_loop, new_loop); > > exit_dest = exit->dest; > was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS, > @@ -729,35 +767,66 @@ slpeel_tree_duplicate_loop_to_edge_cfg ( > /* Also copy the pre-header, this avoids jumping through hoops to > duplicate the loop entry PHI arguments. Create an empty > pre-header unconditionally for this. */ > - basic_block preheader = split_edge (loop_preheader_edge (loop)); > + basic_block preheader = split_edge (loop_preheader_edge (scalar_loop)); > edge entry_e = single_pred_edge (preheader); > - bbs[loop->num_nodes] = preheader; > - new_bbs = XNEWVEC (basic_block, loop->num_nodes + 1); > + bbs[scalar_loop->num_nodes] = preheader; > + new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1); > > - copy_bbs (bbs, loop->num_nodes + 1, new_bbs, > + exit = single_exit (scalar_loop); > + copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs, > &exit, 1, &new_exit, NULL, > e->src, true); > - basic_block new_preheader = new_bbs[loop->num_nodes]; > + exit = single_exit (loop); > + basic_block new_preheader = new_bbs[scalar_loop->num_nodes]; > > - add_phi_args_after_copy (new_bbs, loop->num_nodes + 1, NULL); > + add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL); > > if (at_exit) /* Add the loop copy at exit. */ > { > + if (scalar_loop != loop) > + { > + gimple_stmt_iterator gsi; > + new_exit = redirect_edge_and_branch (new_exit, exit_dest); > + > + for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi); > + gsi_next (&gsi)) > + { > + gimple phi = gsi_stmt (gsi); > + tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e); > + location_t orig_locus > + = gimple_phi_arg_location_from_edge (phi, e); > + > + add_phi_arg (phi, orig_arg, new_exit, orig_locus); > + } > + } > redirect_edge_and_branch_force (e, new_preheader); > flush_pending_stmts (e); > set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src); > if (was_imm_dom) > - set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_loop->header); > + set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src); > > /* And remove the non-necessary forwarder again. Keep the other > one so we have a proper pre-header for the loop at the exit edge. */ > - redirect_edge_pred (single_succ_edge (preheader), single_pred (preheader)); > + redirect_edge_pred (single_succ_edge (preheader), > + single_pred (preheader)); > delete_basic_block (preheader); > - set_immediate_dominator (CDI_DOMINATORS, loop->header, > - loop_preheader_edge (loop)->src); > + set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header, > + loop_preheader_edge (scalar_loop)->src); > } > else /* Add the copy at entry. */ > { > + if (scalar_loop != loop) > + { > + /* Remove the non-necessary forwarder of scalar_loop again. */ > + redirect_edge_pred (single_succ_edge (preheader), > + single_pred (preheader)); > + delete_basic_block (preheader); > + set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header, > + loop_preheader_edge (scalar_loop)->src); > + preheader = split_edge (loop_preheader_edge (loop)); > + entry_e = single_pred_edge (preheader); > + } > + > redirect_edge_and_branch_force (entry_e, new_preheader); > flush_pending_stmts (entry_e); > set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src); > @@ -768,15 +837,39 @@ slpeel_tree_duplicate_loop_to_edge_cfg ( > > /* And remove the non-necessary forwarder again. Keep the other > one so we have a proper pre-header for the loop at the exit edge. */ > - redirect_edge_pred (single_succ_edge (new_preheader), single_pred (new_preheader)); > + redirect_edge_pred (single_succ_edge (new_preheader), > + single_pred (new_preheader)); > delete_basic_block (new_preheader); > set_immediate_dominator (CDI_DOMINATORS, new_loop->header, > loop_preheader_edge (new_loop)->src); > } > > - for (unsigned i = 0; i < loop->num_nodes+1; i++) > + for (unsigned i = 0; i < scalar_loop->num_nodes + 1; i++) > rename_variables_in_bb (new_bbs[i]); > > + if (scalar_loop != loop) > + { > + /* Update new_loop->header PHIs, so that on the preheader > + edge they are the ones from loop rather than scalar_loop. */ > + gimple_stmt_iterator gsi_orig, gsi_new; > + edge orig_e = loop_preheader_edge (loop); > + edge new_e = loop_preheader_edge (new_loop); > + > + for (gsi_orig = gsi_start_phis (loop->header), > + gsi_new = gsi_start_phis (new_loop->header); > + !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new); > + gsi_next (&gsi_orig), gsi_next (&gsi_new)) > + { > + gimple orig_phi = gsi_stmt (gsi_orig); > + gimple new_phi = gsi_stmt (gsi_new); > + tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e); > + location_t orig_locus > + = gimple_phi_arg_location_from_edge (orig_phi, orig_e); > + > + add_phi_arg (new_phi, orig_arg, new_e, orig_locus); > + } > + } > + > free (new_bbs); > free (bbs); > > @@ -1028,8 +1121,8 @@ set_prologue_iterations (basic_block bb_ > FORNOW the resulting code will not be in loop-closed-ssa form. > */ > > -static struct loop* > -slpeel_tree_peel_loop_to_edge (struct loop *loop, > +static struct loop * > +slpeel_tree_peel_loop_to_edge (struct loop *loop, struct loop *scalar_loop, > edge e, tree *first_niters, > tree niters, bool update_first_loop_count, > unsigned int th, bool check_profitability, > @@ -1114,7 +1207,8 @@ slpeel_tree_peel_loop_to_edge (struct lo > orig_exit_bb: > */ > > - if (!(new_loop = slpeel_tree_duplicate_loop_to_edge_cfg (loop, e))) > + if (!(new_loop = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, > + e))) > { > loop_loc = find_loop_location (loop); > dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc, > @@ -1291,7 +1385,7 @@ slpeel_tree_peel_loop_to_edge (struct lo > inverse_probability (first_guard_probability)); > scale_loop_profile (first_loop, first_guard_probability, > check_profitability && (int)th > bound1 ? th : bound1); > - slpeel_update_phi_nodes_for_guard1 (skip_e, first_loop, > + slpeel_update_phi_nodes_for_guard1 (skip_e, first_loop, scalar_loop, > first_loop == new_loop, > &new_exit_bb); > > @@ -1331,7 +1425,7 @@ slpeel_tree_peel_loop_to_edge (struct lo > bb_after_second_loop, bb_before_first_loop, > inverse_probability (second_guard_probability)); > scale_loop_profile (second_loop, probability_of_second_loop, bound2); > - slpeel_update_phi_nodes_for_guard2 (skip_e, second_loop, > + slpeel_update_phi_nodes_for_guard2 (skip_e, second_loop, scalar_loop, > second_loop == new_loop, &new_exit_bb); > > /* 4. Make first-loop iterate FIRST_NITERS times, if requested. > @@ -1755,6 +1849,7 @@ vect_do_peeling_for_loop_bound (loop_vec > { > tree ni_name, ratio_mult_vf_name; > struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); > + struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo); > struct loop *new_loop; > edge update_e; > basic_block preheader; > @@ -1780,11 +1875,12 @@ vect_do_peeling_for_loop_bound (loop_vec > > loop_num = loop->num; > > - new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop), > - &ratio_mult_vf_name, ni_name, false, > - th, check_profitability, > - cond_expr, cond_expr_stmt_list, > - 0, LOOP_VINFO_VECT_FACTOR (loop_vinfo)); > + new_loop > + = slpeel_tree_peel_loop_to_edge (loop, scalar_loop, single_exit (loop), > + &ratio_mult_vf_name, ni_name, false, > + th, check_profitability, > + cond_expr, cond_expr_stmt_list, > + 0, LOOP_VINFO_VECT_FACTOR (loop_vinfo)); > gcc_assert (new_loop); > gcc_assert (loop_num == loop->num); > #ifdef ENABLE_CHECKING > @@ -2017,6 +2113,7 @@ vect_do_peeling_for_alignment (loop_vec_ > unsigned int th, bool check_profitability) > { > struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); > + struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo); > tree niters_of_prolog_loop, ni_name; > tree n_iters; > tree wide_prolog_niters; > @@ -2038,11 +2135,11 @@ vect_do_peeling_for_alignment (loop_vec_ > > /* Peel the prolog loop and iterate it niters_of_prolog_loop. */ > new_loop = > - slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop), > + slpeel_tree_peel_loop_to_edge (loop, scalar_loop, > + loop_preheader_edge (loop), > &niters_of_prolog_loop, ni_name, true, > th, check_profitability, NULL_TREE, NULL, > - bound, > - 0); > + bound, 0); > > gcc_assert (new_loop); > #ifdef ENABLE_CHECKING > @@ -2398,6 +2495,7 @@ vect_loop_versioning (loop_vec_info loop > unsigned int th, bool check_profitability) > { > struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); > + struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo); > basic_block condition_bb; > gimple_stmt_iterator gsi, cond_exp_gsi; > basic_block merge_bb; > @@ -2433,8 +2531,45 @@ vect_loop_versioning (loop_vec_info loop > gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list); > > initialize_original_copy_tables (); > - loop_version (loop, cond_expr, &condition_bb, > - prob, prob, REG_BR_PROB_BASE - prob, true); > + if (scalar_loop) > + { > + edge scalar_e; > + basic_block preheader, scalar_preheader; > + > + /* We don't want to scale SCALAR_LOOP's frequencies, we need to > + scale LOOP's frequencies instead. */ > + loop_version (scalar_loop, cond_expr, &condition_bb, > + prob, REG_BR_PROB_BASE, REG_BR_PROB_BASE - prob, true); > + scale_loop_frequencies (loop, prob, REG_BR_PROB_BASE); > + /* CONDITION_BB was created above SCALAR_LOOP's preheader, > + while we need to move it above LOOP's preheader. */ > + e = loop_preheader_edge (loop); > + scalar_e = loop_preheader_edge (scalar_loop); > + gcc_assert (gimple_seq_empty_p (bb_seq (e->src)) > + && gimple_seq_empty_p (phi_nodes (e->src)) > + && single_pred_p (e->src)); > + gcc_assert (gimple_seq_empty_p (bb_seq (scalar_e->src)) > + && gimple_seq_empty_p (phi_nodes (scalar_e->src)) > + && single_pred_p (scalar_e->src)); > + gcc_assert (single_pred_p (condition_bb)); > + preheader = e->src; > + scalar_preheader = scalar_e->src; > + scalar_e = find_edge (condition_bb, scalar_preheader); > + e = single_pred_edge (preheader); > + redirect_edge_and_branch_force (single_pred_edge (condition_bb), > + scalar_preheader); > + redirect_edge_and_branch_force (scalar_e, preheader); > + redirect_edge_and_branch_force (e, condition_bb); > + set_immediate_dominator (CDI_DOMINATORS, condition_bb, > + single_pred (condition_bb)); > + set_immediate_dominator (CDI_DOMINATORS, scalar_preheader, > + single_pred (scalar_preheader)); > + set_immediate_dominator (CDI_DOMINATORS, preheader, > + condition_bb); > + } > + else > + loop_version (loop, cond_expr, &condition_bb, > + prob, prob, REG_BR_PROB_BASE - prob, true); > > if (LOCATION_LOCUS (vect_location) != UNKNOWN_LOC > && dump_enabled_p ()) > @@ -2457,24 +2592,29 @@ vect_loop_versioning (loop_vec_info loop > basic block (i.e. it has two predecessors). Just in order to simplify > following transformations in the vectorizer, we fix this situation > here by adding a new (empty) block on the exit-edge of the loop, > - with the proper loop-exit phis to maintain loop-closed-form. */ > + with the proper loop-exit phis to maintain loop-closed-form. > + If loop versioning wasn't done from loop, but scalar_loop instead, > + merge_bb will have already just a single successor. */ > > merge_bb = single_exit (loop)->dest; > - gcc_assert (EDGE_COUNT (merge_bb->preds) == 2); > - new_exit_bb = split_edge (single_exit (loop)); > - new_exit_e = single_exit (loop); > - e = EDGE_SUCC (new_exit_bb, 0); > - > - for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi); gsi_next (&gsi)) > + if (scalar_loop == NULL || EDGE_COUNT (merge_bb->preds) >= 2) > { > - tree new_res; > - orig_phi = gsi_stmt (gsi); > - new_res = copy_ssa_name (PHI_RESULT (orig_phi), NULL); > - new_phi = create_phi_node (new_res, new_exit_bb); > - arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e); > - add_phi_arg (new_phi, arg, new_exit_e, > - gimple_phi_arg_location_from_edge (orig_phi, e)); > - adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi)); > + gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2); > + new_exit_bb = split_edge (single_exit (loop)); > + new_exit_e = single_exit (loop); > + e = EDGE_SUCC (new_exit_bb, 0); > + > + for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi); gsi_next (&gsi)) > + { > + tree new_res; > + orig_phi = gsi_stmt (gsi); > + new_res = copy_ssa_name (PHI_RESULT (orig_phi), NULL); > + new_phi = create_phi_node (new_res, new_exit_bb); > + arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e); > + add_phi_arg (new_phi, arg, new_exit_e, > + gimple_phi_arg_location_from_edge (orig_phi, e)); > + adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi)); > + } > } > > /* End loop-exit-fixes after versioning. */ > --- gcc/tree-vectorizer.c.jj 2013-10-11 14:32:57.082909767 +0200 > +++ gcc/tree-vectorizer.c 2013-10-14 15:34:19.921860478 +0200 > @@ -306,6 +306,43 @@ vect_destroy_datarefs (loop_vec_info loo > } > > > +/* If LOOP has been versioned during ifcvt, return the internal call > + guarding it. */ > + > +static gimple > +vect_loop_vectorized_call (struct loop *loop) > +{ > + basic_block bb = loop_preheader_edge (loop)->src; > + gimple g; > + do > + { > + g = last_stmt (bb); > + if (g) > + break; > + if (!single_pred_p (bb)) > + break; > + bb = single_pred (bb); > + } > + while (1); > + if (g && gimple_code (g) == GIMPLE_COND) > + { > + gimple_stmt_iterator gsi = gsi_for_stmt (g); > + gsi_prev (&gsi); > + if (!gsi_end_p (gsi)) > + { > + g = gsi_stmt (gsi); > + if (is_gimple_call (g) > + && gimple_call_internal_p (g) > + && gimple_call_internal_fn (g) == IFN_LOOP_VECTORIZED > + && (tree_low_cst (gimple_call_arg (g, 0), 0) == loop->num > + || tree_low_cst (gimple_call_arg (g, 1), 0) == loop->num)) > + return g; > + } > + } > + return NULL; > +} > + > + > /* Function vectorize_loops. > > Entry point to loop vectorization phase. */ > @@ -320,6 +357,8 @@ vectorize_loops (void) > struct loop *loop; > hash_table <simduid_to_vf> simduid_to_vf_htab; > hash_table <simd_array_to_simduid> simd_array_to_simduid_htab; > + bool any_ifcvt_loops = false; > + unsigned ret = 0; > > vect_loops_num = number_of_loops (cfun); > > @@ -342,8 +381,11 @@ vectorize_loops (void) > than all previously defined loops. This fact allows us to run > only over initial loops skipping newly generated ones. */ > FOR_EACH_LOOP (li, loop, 0) > - if ((flag_tree_loop_vectorize && optimize_loop_nest_for_speed_p (loop)) > - || loop->force_vect) > + if (loop->dont_vectorize) > + any_ifcvt_loops = true; > + else if ((flag_tree_loop_vectorize > + && optimize_loop_nest_for_speed_p (loop)) > + || loop->force_vect) > { > loop_vec_info loop_vinfo; > vect_location = find_loop_location (loop); > @@ -361,6 +403,38 @@ vectorize_loops (void) > if (!dbg_cnt (vect_loop)) > break; > > + gimple loop_vectorized_call = vect_loop_vectorized_call (loop); > + if (loop_vectorized_call) > + { > + tree arg = gimple_call_arg (loop_vectorized_call, 1); > + basic_block *bbs; > + unsigned int i; > + struct loop *scalar_loop = get_loop (cfun, tree_low_cst (arg, 0)); > + > + LOOP_VINFO_SCALAR_LOOP (loop_vinfo) = scalar_loop; > + gcc_checking_assert (vect_loop_vectorized_call > + (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)) > + == loop_vectorized_call); > + bbs = get_loop_body (scalar_loop); > + for (i = 0; i < scalar_loop->num_nodes; i++) > + { > + basic_block bb = bbs[i]; > + gimple_stmt_iterator gsi; > + for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); > + gsi_next (&gsi)) > + { > + gimple phi = gsi_stmt (gsi); > + gimple_set_uid (phi, 0); > + } > + for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); > + gsi_next (&gsi)) > + { > + gimple stmt = gsi_stmt (gsi); > + gimple_set_uid (stmt, 0); > + } > + } > + free (bbs); > + } > if (LOCATION_LOCUS (vect_location) != UNKNOWN_LOC > && dump_enabled_p ()) > dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location, > @@ -381,6 +455,25 @@ vectorize_loops (void) > *simduid_to_vf_htab.find_slot (simduid_to_vf_data, INSERT) > = simduid_to_vf_data; > } > + > + if (loop_vectorized_call) > + { > + gimple g = loop_vectorized_call; > + tree lhs = gimple_call_lhs (g); > + gimple_stmt_iterator gsi = gsi_for_stmt (g); > + gimplify_and_update_call_from_tree (&gsi, boolean_true_node); > + gsi_next (&gsi); > + if (!gsi_end_p (gsi)) > + { > + g = gsi_stmt (gsi); > + if (gimple_code (g) == GIMPLE_COND > + && gimple_cond_lhs (g) == lhs) > + { > + gimple_cond_set_lhs (g, boolean_true_node); > + update_stmt (g); > + } > + } > + } > } > > vect_location = UNKNOWN_LOC; > @@ -394,6 +487,34 @@ vectorize_loops (void) > > /* ----------- Finalize. ----------- */ > > + if (any_ifcvt_loops) > + for (i = 1; i < vect_loops_num; i++) > + { > + loop = get_loop (cfun, i); > + if (loop && loop->dont_vectorize) > + { > + gimple g = vect_loop_vectorized_call (loop); > + if (g) > + { > + tree lhs = gimple_call_lhs (g); > + gimple_stmt_iterator gsi = gsi_for_stmt (g); > + gimplify_and_update_call_from_tree (&gsi, boolean_false_node); > + gsi_next (&gsi); > + if (!gsi_end_p (gsi)) > + { > + g = gsi_stmt (gsi); > + if (gimple_code (g) == GIMPLE_COND > + && gimple_cond_lhs (g) == lhs) > + { > + gimple_cond_set_lhs (g, boolean_false_node); > + update_stmt (g); > + } > + } > + ret = TODO_cleanup_cfg; > + } > + } > + } > + > for (i = 1; i < vect_loops_num; i++) > { > loop_vec_info loop_vinfo; > @@ -451,7 +572,7 @@ vectorize_loops (void) > return TODO_cleanup_cfg; > } > > - return 0; > + return ret; > } > > > --- gcc/tree-vectorizer.h.jj 2013-10-11 14:32:57.086909746 +0200 > +++ gcc/tree-vectorizer.h 2013-10-14 14:32:55.538688209 +0200 > @@ -314,6 +314,10 @@ typedef struct _loop_vec_info { > fix it up. */ > bool operands_swapped; > > + /* If if-conversion versioned this loop before conversion, this is the > + loop version without if-conversion. */ > + struct loop *scalar_loop; > + > } *loop_vec_info; > > /* Access Functions. */ > @@ -345,6 +349,7 @@ typedef struct _loop_vec_info { > #define LOOP_VINFO_TARGET_COST_DATA(L) (L)->target_cost_data > #define LOOP_VINFO_PEELING_FOR_GAPS(L) (L)->peeling_for_gaps > #define LOOP_VINFO_OPERANDS_SWAPPED(L) (L)->operands_swapped > +#define LOOP_VINFO_SCALAR_LOOP(L) (L)->scalar_loop > > #define LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT(L) \ > (L)->may_misalign_stmts.length () > 0 > @@ -899,7 +904,8 @@ extern LOC vect_location; > in tree-vect-loop-manip.c. */ > extern void slpeel_make_loop_iterate_ntimes (struct loop *, tree); > extern bool slpeel_can_duplicate_loop_p (const struct loop *, const_edge); > -struct loop *slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *, edge); > +struct loop *slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *, > + struct loop *, edge); > extern void vect_loop_versioning (loop_vec_info, unsigned int, bool); > extern void vect_do_peeling_for_loop_bound (loop_vec_info, tree *, > unsigned int, bool); > --- gcc/cfgloop.h.jj 2013-10-11 14:32:57.089909730 +0200 > +++ gcc/cfgloop.h 2013-10-11 17:23:58.706526905 +0200 > @@ -177,6 +177,9 @@ struct GTY ((chain_next ("%h.next"))) lo > /* True if we should try harder to vectorize this loop. */ > bool force_vect; > > + /* True if this loop should never be vectorized. */ > + bool dont_vectorize; > + > /* For SIMD loops, this is a unique identifier of the loop, referenced > by IFN_GOMP_SIMD_VF, IFN_GOMP_SIMD_LANE and IFN_GOMP_SIMD_LAST_LANE > builtins. */ > --- gcc/tree-loop-distribution.c.jj 2013-10-07 15:06:40.000000000 +0200 > +++ gcc/tree-loop-distribution.c 2013-10-14 14:33:22.448549212 +0200 > @@ -673,7 +673,7 @@ copy_loop_before (struct loop *loop) > edge preheader = loop_preheader_edge (loop); > > initialize_original_copy_tables (); > - res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, preheader); > + res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, NULL, preheader); > gcc_assert (res != NULL); > free_original_copy_tables (); > delete_update_ssa (); > --- gcc/internal-fn.c.jj 2013-10-11 14:32:57.092909715 +0200 > +++ gcc/internal-fn.c 2013-10-11 17:23:58.706526905 +0200 > @@ -133,6 +133,14 @@ expand_GOMP_SIMD_LAST_LANE (gimple stmt > gcc_unreachable (); > } > > +/* This should get folded in tree-vectorizer.c. */ > + > +static void > +expand_LOOP_VECTORIZED (gimple stmt ATTRIBUTE_UNUSED) > +{ > + gcc_unreachable (); > +} > + > /* Routines to expand each internal function, indexed by function number. > Each routine has the prototype: > > --- gcc/tree-if-conv.c.jj 2013-10-11 14:32:57.095909699 +0200 > +++ gcc/tree-if-conv.c 2013-10-11 17:23:58.707526969 +0200 > @@ -1735,6 +1735,48 @@ combine_blocks (struct loop *loop) > ifc_bbs = NULL; > } > > +static bool > +version_loop_for_if_conversion (struct loop *loop) > +{ > + basic_block cond_bb; > + tree cond = make_ssa_name (boolean_type_node, NULL); > + struct loop *new_loop; > + gimple g; > + gimple_stmt_iterator gsi; > + void **aux = XNEWVEC (void *, loop->num_nodes); > + unsigned int i; > + > + /* We have data stored in bb->aux, but loop_version also > + uses it, so save it temporarily and restore after loop_version. */ > + for (i = 0; i < loop->num_nodes; i++) > + { > + aux[i] = ifc_bbs[i]->aux; > + ifc_bbs[i]->aux = NULL; > + } > + g = gimple_build_call_internal (IFN_LOOP_VECTORIZED, 2, > + build_int_cst (integer_type_node, loop->num), > + integer_zero_node); > + gimple_call_set_lhs (g, cond); > + > + initialize_original_copy_tables (); > + new_loop = loop_version (loop, cond, &cond_bb, > + REG_BR_PROB_BASE, REG_BR_PROB_BASE, > + REG_BR_PROB_BASE, true); > + free_original_copy_tables (); > + for (i = 0; i < loop->num_nodes; i++) > + ifc_bbs[i]->aux = aux[i]; > + XDELETEVEC (aux); > + if (new_loop == NULL) > + return false; > + new_loop->dont_vectorize = true; > + new_loop->force_vect = false; > + gsi = gsi_last_bb (cond_bb); > + gimple_call_set_arg (g, 1, build_int_cst (integer_type_node, new_loop->num)); > + gsi_insert_before (&gsi, g, GSI_SAME_STMT); > + update_ssa (TODO_update_ssa); > + return true; > +} > + > /* If-convert LOOP when it is legal. For the moment this pass has no > profitability analysis. Returns true when something changed. */ > > @@ -1744,10 +1786,18 @@ tree_if_conversion (struct loop *loop) > bool changed = false; > ifc_bbs = NULL; > > + if (loop->dont_vectorize) > + goto cleanup; > + > if (!if_convertible_loop_p (loop) > || !dbg_cnt (if_conversion_tree)) > goto cleanup; > > + if ((flag_tree_loop_vectorize || loop->force_vect) > + && flag_tree_loop_if_convert == -1 > + && !version_loop_for_if_conversion (loop)) > + goto cleanup; > + > /* Now all statements are if-convertible. Combine all the basic > blocks into one huge basic block doing the if-conversion > on-the-fly. */ > --- gcc/testsuite/gcc.dg/vect/vect-cond-11.c.jj 2013-10-15 14:01:07.877814190 +0200 > +++ gcc/testsuite/gcc.dg/vect/vect-cond-11.c 2013-10-15 14:02:29.302414970 +0200 > @@ -0,0 +1,116 @@ > +#include "tree-vect.h" > + > +#define N 1024 > +typedef int V __attribute__((vector_size (4))); > +unsigned int a[N * 2] __attribute__((aligned)); > +unsigned int b[N * 2] __attribute__((aligned)); > +V c[N]; > + > +__attribute__((noinline, noclone)) unsigned int > +foo (unsigned int *a, unsigned int *b) > +{ > + int i; > + unsigned int r = 0; > + for (i = 0; i < N; i++) > + { > + unsigned int x = a[i], y = b[i]; > + if (x < 32) > + { > + x = x + 127; > + y = y * 2; > + } > + else > + { > + x = x - 16; > + y = y + 1; > + } > + a[i] = x; > + b[i] = y; > + r += x; > + } > + return r; > +} > + > +__attribute__((noinline, noclone)) unsigned int > +bar (unsigned int *a, unsigned int *b) > +{ > + int i; > + unsigned int r = 0; > + for (i = 0; i < N; i++) > + { > + unsigned int x = a[i], y = b[i]; > + if (x < 32) > + { > + x = x + 127; > + y = y * 2; > + } > + else > + { > + x = x - 16; > + y = y + 1; > + } > + a[i] = x; > + b[i] = y; > + c[i] = c[i] + 1; > + r += x; > + } > + return r; > +} > + > +void > +baz (unsigned int *a, unsigned int *b, > + unsigned int (*fn) (unsigned int *, unsigned int *)) > +{ > + int i; > + for (i = -64; i < 0; i++) > + { > + a[i] = 19; > + b[i] = 17; > + } > + for (; i < N; i++) > + { > + a[i] = i - 512; > + b[i] = i; > + } > + for (; i < N + 64; i++) > + { > + a[i] = 27; > + b[i] = 19; > + } > + if (fn (a, b) != -512U - (N - 32) * 16U + 32 * 127U) > + __builtin_abort (); > + for (i = -64; i < 0; i++) > + if (a[i] != 19 || b[i] != 17) > + __builtin_abort (); > + for (; i < N; i++) > + if (a[i] != (i - 512U < 32U ? i - 512U + 127 : i - 512U - 16) > + || b[i] != (i - 512U < 32U ? i * 2U : i + 1U)) > + __builtin_abort (); > + for (; i < N + 64; i++) > + if (a[i] != 27 || b[i] != 19) > + __builtin_abort (); > +} > + > +int > +main () > +{ > + int i; > + check_vect (); > + baz (a + 512, b + 512, foo); > + baz (a + 512, b + 512, bar); > + baz (a + 512 + 1, b + 512 + 1, foo); > + baz (a + 512 + 1, b + 512 + 1, bar); > + baz (a + 512 + 31, b + 512 + 31, foo); > + baz (a + 512 + 31, b + 512 + 31, bar); > + baz (a + 512 + 1, b + 512, foo); > + baz (a + 512 + 1, b + 512, bar); > + baz (a + 512 + 31, b + 512, foo); > + baz (a + 512 + 31, b + 512, bar); > + baz (a + 512, b + 512 + 1, foo); > + baz (a + 512, b + 512 + 1, bar); > + baz (a + 512, b + 512 + 31, foo); > + baz (a + 512, b + 512 + 31, bar); > + return 0; > +} > + > +/* { dg-final { cleanup-tree-dump "vect" } } */ > --- gcc/testsuite/gcc.dg/vect/bb-slp-cond-1.c.jj 2013-08-30 14:38:40.000000000 +0200 > +++ gcc/testsuite/gcc.dg/vect/bb-slp-cond-1.c 2013-10-14 13:19:21.704256653 +0200 > @@ -1,4 +1,5 @@ > /* { dg-require-effective-target vect_condition } */ > +/* { dg-additional-options "-ftree-loop-if-convert" } */ > > #include "tree-vect.h" > > --- gcc/testsuite/gcc.dg/vect/bb-slp-pattern-2.c.jj 2013-08-30 14:38:40.000000000 +0200 > +++ gcc/testsuite/gcc.dg/vect/bb-slp-pattern-2.c 2013-10-14 13:19:35.678195952 +0200 > @@ -1,4 +1,5 @@ > /* { dg-require-effective-target vect_condition } */ > +/* { dg-additional-options "-ftree-loop-if-convert" } */ > > #include "tree-vect.h" > > > Jakub
On Wed, 16 Oct 2013, pinskia@gmail.com wrote: > > > On Oct 15, 2013, at 5:32 AM, Jakub Jelinek <jakub@redhat.com> wrote: > > > > Hi! > > > > Especially on i?86/x86_64 if-conversion pass seems to be often > > a pessimization, but the vectorization relies on it and without it we can't > > vectorize a lot of the loops. > > I think on many other targets it actually helps. I know for one it > helps on octeon even though octeon has no vector instructions. I think > it helps most arm targets too. The main issue is that it has no cost model - the only cost model being that it can successfully if-convert all conditional code in a loop, resulting in a single-BB loop. So it is clearly vectorization targeted. It's infrastructure may be useful to do a more sensible if-conversion on GIMPLE level on scalar code. Of course even the infrastructure needs some TLC (and some better generic machinery of keeping track and simplifying of a predicate combination). Thanks, Richard. > Thanks, > Andrew > > > > > Here is a prototype of a patch that will by default (unless explicit > > -ftree-loop-if-convert) only if-convert loops internally for vectorization, > > so the COND_EXPRs actually only appear as VEC_COND_EXPRs in the vectorized > > basic blocks, but will not appear if vectorization fails, or in the > > scalar loop if vectorization is conditional, or in the prologue or epilogue > > loops around the vectorized loop. > > > > Instead of moving the ifcvt pass inside of the vectorizer, this patch > > during ifcvt performs loop versioning depending on a special internal > > call, only if the internal call returns true we go to the if-converted > > original loop, otherwise the non-if-converted copy of the original loop > > is performed. And the vectorizer is taught to fold this internal call > > into true resp. false depending on if the loop was vectorized or not, and > > vectorizer loop versioning, peeling for alignment and for bound are adjusted > > to also copy from the non-if-converted loop rather than if-converted one. > > > > Besides fixing the various PRs where if-conversion pessimizes code I'd like > > to also move forward with this with conditional loads and stores, > > http://gcc.gnu.org/ml/gcc-patches/2012-11/msg00202.html > > where the if-unconversion approach looked like a failure. > > > > This patch doesn't yet handle if-converted inner loop in outer loop > > vectorization, something on my todo list (so several vect-cond-*.c tests > > FAIL because they are no longer vectorized) plus I had to change two > > SLP vectorization tests that silently relied on loop if-conversion being > > performed to actually optimize the basic block (if the same thing didn't > > appear in a loop, it wouldn't be optimized at all). > > > > On the newly added testcase on x86_64, there are before this patch > > 18 scalar conditional moves, with the patch just 2 (both in the checking > > routine). > > > > Comments? > > > > --- gcc/internal-fn.def.jj 2013-10-11 14:32:57.079909782 +0200 > > +++ gcc/internal-fn.def 2013-10-11 17:23:58.705526840 +0200 > > @@ -43,3 +43,4 @@ DEF_INTERNAL_FN (STORE_LANES, ECF_CONST > > DEF_INTERNAL_FN (GOMP_SIMD_LANE, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW) > > DEF_INTERNAL_FN (GOMP_SIMD_VF, ECF_CONST | ECF_LEAF | ECF_NOTHROW) > > DEF_INTERNAL_FN (GOMP_SIMD_LAST_LANE, ECF_CONST | ECF_LEAF | ECF_NOTHROW) > > +DEF_INTERNAL_FN (LOOP_VECTORIZED, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW) > > --- gcc/tree-vect-loop-manip.c.jj 2013-09-30 22:13:47.000000000 +0200 > > +++ gcc/tree-vect-loop-manip.c 2013-10-15 12:57:54.854970913 +0200 > > @@ -374,24 +374,31 @@ LOOP-> loop1 > > > > static void > > slpeel_update_phi_nodes_for_guard1 (edge guard_edge, struct loop *loop, > > + struct loop *scalar_loop, > > bool is_new_loop, basic_block *new_exit_bb) > > { > > - gimple orig_phi, new_phi; > > + gimple orig_phi, new_phi, scalar_phi = NULL; > > gimple update_phi, update_phi2; > > tree guard_arg, loop_arg; > > basic_block new_merge_bb = guard_edge->dest; > > edge e = EDGE_SUCC (new_merge_bb, 0); > > basic_block update_bb = e->dest; > > basic_block orig_bb = loop->header; > > - edge new_exit_e; > > + edge new_exit_e, scalar_e = NULL; > > tree current_new_name; > > - gimple_stmt_iterator gsi_orig, gsi_update; > > + gimple_stmt_iterator gsi_orig, gsi_update, gsi_scalar = gsi_none (); > > > > /* Create new bb between loop and new_merge_bb. */ > > *new_exit_bb = split_edge (single_exit (loop)); > > > > new_exit_e = EDGE_SUCC (*new_exit_bb, 0); > > > > + if (scalar_loop != NULL && !is_new_loop) > > + { > > + gsi_scalar = gsi_start_phis (scalar_loop->header); > > + scalar_e = EDGE_SUCC (scalar_loop->latch, 0); > > + } > > + > > for (gsi_orig = gsi_start_phis (orig_bb), > > gsi_update = gsi_start_phis (update_bb); > > !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update); > > @@ -401,6 +408,11 @@ slpeel_update_phi_nodes_for_guard1 (edge > > tree new_res; > > orig_phi = gsi_stmt (gsi_orig); > > update_phi = gsi_stmt (gsi_update); > > + if (scalar_e != NULL) > > + { > > + scalar_phi = gsi_stmt (gsi_scalar); > > + gsi_next (&gsi_scalar); > > + } > > > > /** 1. Handle new-merge-point phis **/ > > > > @@ -460,7 +472,13 @@ slpeel_update_phi_nodes_for_guard1 (edge > > current_new_name = loop_arg; > > else > > { > > - current_new_name = get_current_def (loop_arg); > > + if (scalar_e) > > + { > > + current_new_name = PHI_ARG_DEF_FROM_EDGE (scalar_phi, scalar_e); > > + current_new_name = get_current_def (current_new_name); > > + } > > + else > > + current_new_name = get_current_def (loop_arg); > > /* current_def is not available only if the variable does not > > change inside the loop, in which case we also don't care > > about recording a current_def for it because we won't be > > @@ -503,6 +521,7 @@ LOOP-> loop2 > > > > static void > > slpeel_update_phi_nodes_for_guard2 (edge guard_edge, struct loop *loop, > > + struct loop *scalar_loop, > > bool is_new_loop, basic_block *new_exit_bb) > > { > > gimple orig_phi, new_phi; > > @@ -511,17 +530,23 @@ slpeel_update_phi_nodes_for_guard2 (edge > > basic_block new_merge_bb = guard_edge->dest; > > edge e = EDGE_SUCC (new_merge_bb, 0); > > basic_block update_bb = e->dest; > > - edge new_exit_e; > > + edge new_exit_e, scalar_e = NULL; > > tree orig_def, orig_def_new_name; > > tree new_name, new_name2; > > tree arg; > > - gimple_stmt_iterator gsi; > > + gimple_stmt_iterator gsi, gsi_scalar = gsi_none (); > > > > /* Create new bb between loop and new_merge_bb. */ > > *new_exit_bb = split_edge (single_exit (loop)); > > > > new_exit_e = EDGE_SUCC (*new_exit_bb, 0); > > > > + if (scalar_loop != NULL) > > + { > > + scalar_e = single_exit (scalar_loop); > > + gsi_scalar = gsi_start_phis (scalar_e->dest); > > + } > > + > > for (gsi = gsi_start_phis (update_bb); !gsi_end_p (gsi); gsi_next (&gsi)) > > { > > tree new_res; > > @@ -532,7 +557,16 @@ slpeel_update_phi_nodes_for_guard2 (edge > > out of the loop - the phi arg is a constant. */ > > if (TREE_CODE (orig_def) != SSA_NAME) > > continue; > > - orig_def_new_name = get_current_def (orig_def); > > + if (scalar_loop != NULL) > > + { > > + orig_def_new_name > > + = PHI_ARG_DEF_FROM_EDGE (gsi_stmt (gsi_scalar), scalar_e); > > + gcc_assert (TREE_CODE (orig_def_new_name) == SSA_NAME); > > + orig_def_new_name = get_current_def (orig_def_new_name); > > + gsi_next (&gsi_scalar); > > + } > > + else > > + orig_def_new_name = get_current_def (orig_def); > > arg = NULL_TREE; > > > > /** 1. Handle new-merge-point phis **/ > > @@ -693,7 +727,8 @@ slpeel_make_loop_iterate_ntimes (struct > > on E which is either the entry or exit of LOOP. */ > > > > struct loop * > > -slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop, edge e) > > +slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop, > > + struct loop *scalar_loop, edge e) > > { > > struct loop *new_loop; > > basic_block *new_bbs, *bbs; > > @@ -707,19 +742,22 @@ slpeel_tree_duplicate_loop_to_edge_cfg ( > > if (!at_exit && e != loop_preheader_edge (loop)) > > return NULL; > > > > - bbs = XNEWVEC (basic_block, loop->num_nodes + 1); > > - get_loop_body_with_size (loop, bbs, loop->num_nodes); > > + if (scalar_loop == NULL) > > + scalar_loop = loop; > > + > > + bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1); > > + get_loop_body_with_size (scalar_loop, bbs, scalar_loop->num_nodes); > > > > /* Check whether duplication is possible. */ > > - if (!can_copy_bbs_p (bbs, loop->num_nodes)) > > + if (!can_copy_bbs_p (bbs, scalar_loop->num_nodes)) > > { > > free (bbs); > > return NULL; > > } > > > > /* Generate new loop structure. */ > > - new_loop = duplicate_loop (loop, loop_outer (loop)); > > - duplicate_subloops (loop, new_loop); > > + new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop)); > > + duplicate_subloops (scalar_loop, new_loop); > > > > exit_dest = exit->dest; > > was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS, > > @@ -729,35 +767,66 @@ slpeel_tree_duplicate_loop_to_edge_cfg ( > > /* Also copy the pre-header, this avoids jumping through hoops to > > duplicate the loop entry PHI arguments. Create an empty > > pre-header unconditionally for this. */ > > - basic_block preheader = split_edge (loop_preheader_edge (loop)); > > + basic_block preheader = split_edge (loop_preheader_edge (scalar_loop)); > > edge entry_e = single_pred_edge (preheader); > > - bbs[loop->num_nodes] = preheader; > > - new_bbs = XNEWVEC (basic_block, loop->num_nodes + 1); > > + bbs[scalar_loop->num_nodes] = preheader; > > + new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1); > > > > - copy_bbs (bbs, loop->num_nodes + 1, new_bbs, > > + exit = single_exit (scalar_loop); > > + copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs, > > &exit, 1, &new_exit, NULL, > > e->src, true); > > - basic_block new_preheader = new_bbs[loop->num_nodes]; > > + exit = single_exit (loop); > > + basic_block new_preheader = new_bbs[scalar_loop->num_nodes]; > > > > - add_phi_args_after_copy (new_bbs, loop->num_nodes + 1, NULL); > > + add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL); > > > > if (at_exit) /* Add the loop copy at exit. */ > > { > > + if (scalar_loop != loop) > > + { > > + gimple_stmt_iterator gsi; > > + new_exit = redirect_edge_and_branch (new_exit, exit_dest); > > + > > + for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi); > > + gsi_next (&gsi)) > > + { > > + gimple phi = gsi_stmt (gsi); > > + tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e); > > + location_t orig_locus > > + = gimple_phi_arg_location_from_edge (phi, e); > > + > > + add_phi_arg (phi, orig_arg, new_exit, orig_locus); > > + } > > + } > > redirect_edge_and_branch_force (e, new_preheader); > > flush_pending_stmts (e); > > set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src); > > if (was_imm_dom) > > - set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_loop->header); > > + set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src); > > > > /* And remove the non-necessary forwarder again. Keep the other > > one so we have a proper pre-header for the loop at the exit edge. */ > > - redirect_edge_pred (single_succ_edge (preheader), single_pred (preheader)); > > + redirect_edge_pred (single_succ_edge (preheader), > > + single_pred (preheader)); > > delete_basic_block (preheader); > > - set_immediate_dominator (CDI_DOMINATORS, loop->header, > > - loop_preheader_edge (loop)->src); > > + set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header, > > + loop_preheader_edge (scalar_loop)->src); > > } > > else /* Add the copy at entry. */ > > { > > + if (scalar_loop != loop) > > + { > > + /* Remove the non-necessary forwarder of scalar_loop again. */ > > + redirect_edge_pred (single_succ_edge (preheader), > > + single_pred (preheader)); > > + delete_basic_block (preheader); > > + set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header, > > + loop_preheader_edge (scalar_loop)->src); > > + preheader = split_edge (loop_preheader_edge (loop)); > > + entry_e = single_pred_edge (preheader); > > + } > > + > > redirect_edge_and_branch_force (entry_e, new_preheader); > > flush_pending_stmts (entry_e); > > set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src); > > @@ -768,15 +837,39 @@ slpeel_tree_duplicate_loop_to_edge_cfg ( > > > > /* And remove the non-necessary forwarder again. Keep the other > > one so we have a proper pre-header for the loop at the exit edge. */ > > - redirect_edge_pred (single_succ_edge (new_preheader), single_pred (new_preheader)); > > + redirect_edge_pred (single_succ_edge (new_preheader), > > + single_pred (new_preheader)); > > delete_basic_block (new_preheader); > > set_immediate_dominator (CDI_DOMINATORS, new_loop->header, > > loop_preheader_edge (new_loop)->src); > > } > > > > - for (unsigned i = 0; i < loop->num_nodes+1; i++) > > + for (unsigned i = 0; i < scalar_loop->num_nodes + 1; i++) > > rename_variables_in_bb (new_bbs[i]); > > > > + if (scalar_loop != loop) > > + { > > + /* Update new_loop->header PHIs, so that on the preheader > > + edge they are the ones from loop rather than scalar_loop. */ > > + gimple_stmt_iterator gsi_orig, gsi_new; > > + edge orig_e = loop_preheader_edge (loop); > > + edge new_e = loop_preheader_edge (new_loop); > > + > > + for (gsi_orig = gsi_start_phis (loop->header), > > + gsi_new = gsi_start_phis (new_loop->header); > > + !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new); > > + gsi_next (&gsi_orig), gsi_next (&gsi_new)) > > + { > > + gimple orig_phi = gsi_stmt (gsi_orig); > > + gimple new_phi = gsi_stmt (gsi_new); > > + tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e); > > + location_t orig_locus > > + = gimple_phi_arg_location_from_edge (orig_phi, orig_e); > > + > > + add_phi_arg (new_phi, orig_arg, new_e, orig_locus); > > + } > > + } > > + > > free (new_bbs); > > free (bbs); > > > > @@ -1028,8 +1121,8 @@ set_prologue_iterations (basic_block bb_ > > FORNOW the resulting code will not be in loop-closed-ssa form. > > */ > > > > -static struct loop* > > -slpeel_tree_peel_loop_to_edge (struct loop *loop, > > +static struct loop * > > +slpeel_tree_peel_loop_to_edge (struct loop *loop, struct loop *scalar_loop, > > edge e, tree *first_niters, > > tree niters, bool update_first_loop_count, > > unsigned int th, bool check_profitability, > > @@ -1114,7 +1207,8 @@ slpeel_tree_peel_loop_to_edge (struct lo > > orig_exit_bb: > > */ > > > > - if (!(new_loop = slpeel_tree_duplicate_loop_to_edge_cfg (loop, e))) > > + if (!(new_loop = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, > > + e))) > > { > > loop_loc = find_loop_location (loop); > > dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc, > > @@ -1291,7 +1385,7 @@ slpeel_tree_peel_loop_to_edge (struct lo > > inverse_probability (first_guard_probability)); > > scale_loop_profile (first_loop, first_guard_probability, > > check_profitability && (int)th > bound1 ? th : bound1); > > - slpeel_update_phi_nodes_for_guard1 (skip_e, first_loop, > > + slpeel_update_phi_nodes_for_guard1 (skip_e, first_loop, scalar_loop, > > first_loop == new_loop, > > &new_exit_bb); > > > > @@ -1331,7 +1425,7 @@ slpeel_tree_peel_loop_to_edge (struct lo > > bb_after_second_loop, bb_before_first_loop, > > inverse_probability (second_guard_probability)); > > scale_loop_profile (second_loop, probability_of_second_loop, bound2); > > - slpeel_update_phi_nodes_for_guard2 (skip_e, second_loop, > > + slpeel_update_phi_nodes_for_guard2 (skip_e, second_loop, scalar_loop, > > second_loop == new_loop, &new_exit_bb); > > > > /* 4. Make first-loop iterate FIRST_NITERS times, if requested. > > @@ -1755,6 +1849,7 @@ vect_do_peeling_for_loop_bound (loop_vec > > { > > tree ni_name, ratio_mult_vf_name; > > struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); > > + struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo); > > struct loop *new_loop; > > edge update_e; > > basic_block preheader; > > @@ -1780,11 +1875,12 @@ vect_do_peeling_for_loop_bound (loop_vec > > > > loop_num = loop->num; > > > > - new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop), > > - &ratio_mult_vf_name, ni_name, false, > > - th, check_profitability, > > - cond_expr, cond_expr_stmt_list, > > - 0, LOOP_VINFO_VECT_FACTOR (loop_vinfo)); > > + new_loop > > + = slpeel_tree_peel_loop_to_edge (loop, scalar_loop, single_exit (loop), > > + &ratio_mult_vf_name, ni_name, false, > > + th, check_profitability, > > + cond_expr, cond_expr_stmt_list, > > + 0, LOOP_VINFO_VECT_FACTOR (loop_vinfo)); > > gcc_assert (new_loop); > > gcc_assert (loop_num == loop->num); > > #ifdef ENABLE_CHECKING > > @@ -2017,6 +2113,7 @@ vect_do_peeling_for_alignment (loop_vec_ > > unsigned int th, bool check_profitability) > > { > > struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); > > + struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo); > > tree niters_of_prolog_loop, ni_name; > > tree n_iters; > > tree wide_prolog_niters; > > @@ -2038,11 +2135,11 @@ vect_do_peeling_for_alignment (loop_vec_ > > > > /* Peel the prolog loop and iterate it niters_of_prolog_loop. */ > > new_loop = > > - slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop), > > + slpeel_tree_peel_loop_to_edge (loop, scalar_loop, > > + loop_preheader_edge (loop), > > &niters_of_prolog_loop, ni_name, true, > > th, check_profitability, NULL_TREE, NULL, > > - bound, > > - 0); > > + bound, 0); > > > > gcc_assert (new_loop); > > #ifdef ENABLE_CHECKING > > @@ -2398,6 +2495,7 @@ vect_loop_versioning (loop_vec_info loop > > unsigned int th, bool check_profitability) > > { > > struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); > > + struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo); > > basic_block condition_bb; > > gimple_stmt_iterator gsi, cond_exp_gsi; > > basic_block merge_bb; > > @@ -2433,8 +2531,45 @@ vect_loop_versioning (loop_vec_info loop > > gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list); > > > > initialize_original_copy_tables (); > > - loop_version (loop, cond_expr, &condition_bb, > > - prob, prob, REG_BR_PROB_BASE - prob, true); > > + if (scalar_loop) > > + { > > + edge scalar_e; > > + basic_block preheader, scalar_preheader; > > + > > + /* We don't want to scale SCALAR_LOOP's frequencies, we need to > > + scale LOOP's frequencies instead. */ > > + loop_version (scalar_loop, cond_expr, &condition_bb, > > + prob, REG_BR_PROB_BASE, REG_BR_PROB_BASE - prob, true); > > + scale_loop_frequencies (loop, prob, REG_BR_PROB_BASE); > > + /* CONDITION_BB was created above SCALAR_LOOP's preheader, > > + while we need to move it above LOOP's preheader. */ > > + e = loop_preheader_edge (loop); > > + scalar_e = loop_preheader_edge (scalar_loop); > > + gcc_assert (gimple_seq_empty_p (bb_seq (e->src)) > > + && gimple_seq_empty_p (phi_nodes (e->src)) > > + && single_pred_p (e->src)); > > + gcc_assert (gimple_seq_empty_p (bb_seq (scalar_e->src)) > > + && gimple_seq_empty_p (phi_nodes (scalar_e->src)) > > + && single_pred_p (scalar_e->src)); > > + gcc_assert (single_pred_p (condition_bb)); > > + preheader = e->src; > > + scalar_preheader = scalar_e->src; > > + scalar_e = find_edge (condition_bb, scalar_preheader); > > + e = single_pred_edge (preheader); > > + redirect_edge_and_branch_force (single_pred_edge (condition_bb), > > + scalar_preheader); > > + redirect_edge_and_branch_force (scalar_e, preheader); > > + redirect_edge_and_branch_force (e, condition_bb); > > + set_immediate_dominator (CDI_DOMINATORS, condition_bb, > > + single_pred (condition_bb)); > > + set_immediate_dominator (CDI_DOMINATORS, scalar_preheader, > > + single_pred (scalar_preheader)); > > + set_immediate_dominator (CDI_DOMINATORS, preheader, > > + condition_bb); > > + } > > + else > > + loop_version (loop, cond_expr, &condition_bb, > > + prob, prob, REG_BR_PROB_BASE - prob, true); > > > > if (LOCATION_LOCUS (vect_location) != UNKNOWN_LOC > > && dump_enabled_p ()) > > @@ -2457,24 +2592,29 @@ vect_loop_versioning (loop_vec_info loop > > basic block (i.e. it has two predecessors). Just in order to simplify > > following transformations in the vectorizer, we fix this situation > > here by adding a new (empty) block on the exit-edge of the loop, > > - with the proper loop-exit phis to maintain loop-closed-form. */ > > + with the proper loop-exit phis to maintain loop-closed-form. > > + If loop versioning wasn't done from loop, but scalar_loop instead, > > + merge_bb will have already just a single successor. */ > > > > merge_bb = single_exit (loop)->dest; > > - gcc_assert (EDGE_COUNT (merge_bb->preds) == 2); > > - new_exit_bb = split_edge (single_exit (loop)); > > - new_exit_e = single_exit (loop); > > - e = EDGE_SUCC (new_exit_bb, 0); > > - > > - for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi); gsi_next (&gsi)) > > + if (scalar_loop == NULL || EDGE_COUNT (merge_bb->preds) >= 2) > > { > > - tree new_res; > > - orig_phi = gsi_stmt (gsi); > > - new_res = copy_ssa_name (PHI_RESULT (orig_phi), NULL); > > - new_phi = create_phi_node (new_res, new_exit_bb); > > - arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e); > > - add_phi_arg (new_phi, arg, new_exit_e, > > - gimple_phi_arg_location_from_edge (orig_phi, e)); > > - adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi)); > > + gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2); > > + new_exit_bb = split_edge (single_exit (loop)); > > + new_exit_e = single_exit (loop); > > + e = EDGE_SUCC (new_exit_bb, 0); > > + > > + for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi); gsi_next (&gsi)) > > + { > > + tree new_res; > > + orig_phi = gsi_stmt (gsi); > > + new_res = copy_ssa_name (PHI_RESULT (orig_phi), NULL); > > + new_phi = create_phi_node (new_res, new_exit_bb); > > + arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e); > > + add_phi_arg (new_phi, arg, new_exit_e, > > + gimple_phi_arg_location_from_edge (orig_phi, e)); > > + adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi)); > > + } > > } > > > > /* End loop-exit-fixes after versioning. */ > > --- gcc/tree-vectorizer.c.jj 2013-10-11 14:32:57.082909767 +0200 > > +++ gcc/tree-vectorizer.c 2013-10-14 15:34:19.921860478 +0200 > > @@ -306,6 +306,43 @@ vect_destroy_datarefs (loop_vec_info loo > > } > > > > > > +/* If LOOP has been versioned during ifcvt, return the internal call > > + guarding it. */ > > + > > +static gimple > > +vect_loop_vectorized_call (struct loop *loop) > > +{ > > + basic_block bb = loop_preheader_edge (loop)->src; > > + gimple g; > > + do > > + { > > + g = last_stmt (bb); > > + if (g) > > + break; > > + if (!single_pred_p (bb)) > > + break; > > + bb = single_pred (bb); > > + } > > + while (1); > > + if (g && gimple_code (g) == GIMPLE_COND) > > + { > > + gimple_stmt_iterator gsi = gsi_for_stmt (g); > > + gsi_prev (&gsi); > > + if (!gsi_end_p (gsi)) > > + { > > + g = gsi_stmt (gsi); > > + if (is_gimple_call (g) > > + && gimple_call_internal_p (g) > > + && gimple_call_internal_fn (g) == IFN_LOOP_VECTORIZED > > + && (tree_low_cst (gimple_call_arg (g, 0), 0) == loop->num > > + || tree_low_cst (gimple_call_arg (g, 1), 0) == loop->num)) > > + return g; > > + } > > + } > > + return NULL; > > +} > > + > > + > > /* Function vectorize_loops. > > > > Entry point to loop vectorization phase. */ > > @@ -320,6 +357,8 @@ vectorize_loops (void) > > struct loop *loop; > > hash_table <simduid_to_vf> simduid_to_vf_htab; > > hash_table <simd_array_to_simduid> simd_array_to_simduid_htab; > > + bool any_ifcvt_loops = false; > > + unsigned ret = 0; > > > > vect_loops_num = number_of_loops (cfun); > > > > @@ -342,8 +381,11 @@ vectorize_loops (void) > > than all previously defined loops. This fact allows us to run > > only over initial loops skipping newly generated ones. */ > > FOR_EACH_LOOP (li, loop, 0) > > - if ((flag_tree_loop_vectorize && optimize_loop_nest_for_speed_p (loop)) > > - || loop->force_vect) > > + if (loop->dont_vectorize) > > + any_ifcvt_loops = true; > > + else if ((flag_tree_loop_vectorize > > + && optimize_loop_nest_for_speed_p (loop)) > > + || loop->force_vect) > > { > > loop_vec_info loop_vinfo; > > vect_location = find_loop_location (loop); > > @@ -361,6 +403,38 @@ vectorize_loops (void) > > if (!dbg_cnt (vect_loop)) > > break; > > > > + gimple loop_vectorized_call = vect_loop_vectorized_call (loop); > > + if (loop_vectorized_call) > > + { > > + tree arg = gimple_call_arg (loop_vectorized_call, 1); > > + basic_block *bbs; > > + unsigned int i; > > + struct loop *scalar_loop = get_loop (cfun, tree_low_cst (arg, 0)); > > + > > + LOOP_VINFO_SCALAR_LOOP (loop_vinfo) = scalar_loop; > > + gcc_checking_assert (vect_loop_vectorized_call > > + (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)) > > + == loop_vectorized_call); > > + bbs = get_loop_body (scalar_loop); > > + for (i = 0; i < scalar_loop->num_nodes; i++) > > + { > > + basic_block bb = bbs[i]; > > + gimple_stmt_iterator gsi; > > + for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); > > + gsi_next (&gsi)) > > + { > > + gimple phi = gsi_stmt (gsi); > > + gimple_set_uid (phi, 0); > > + } > > + for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); > > + gsi_next (&gsi)) > > + { > > + gimple stmt = gsi_stmt (gsi); > > + gimple_set_uid (stmt, 0); > > + } > > + } > > + free (bbs); > > + } > > if (LOCATION_LOCUS (vect_location) != UNKNOWN_LOC > > && dump_enabled_p ()) > > dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location, > > @@ -381,6 +455,25 @@ vectorize_loops (void) > > *simduid_to_vf_htab.find_slot (simduid_to_vf_data, INSERT) > > = simduid_to_vf_data; > > } > > + > > + if (loop_vectorized_call) > > + { > > + gimple g = loop_vectorized_call; > > + tree lhs = gimple_call_lhs (g); > > + gimple_stmt_iterator gsi = gsi_for_stmt (g); > > + gimplify_and_update_call_from_tree (&gsi, boolean_true_node); > > + gsi_next (&gsi); > > + if (!gsi_end_p (gsi)) > > + { > > + g = gsi_stmt (gsi); > > + if (gimple_code (g) == GIMPLE_COND > > + && gimple_cond_lhs (g) == lhs) > > + { > > + gimple_cond_set_lhs (g, boolean_true_node); > > + update_stmt (g); > > + } > > + } > > + } > > } > > > > vect_location = UNKNOWN_LOC; > > @@ -394,6 +487,34 @@ vectorize_loops (void) > > > > /* ----------- Finalize. ----------- */ > > > > + if (any_ifcvt_loops) > > + for (i = 1; i < vect_loops_num; i++) > > + { > > + loop = get_loop (cfun, i); > > + if (loop && loop->dont_vectorize) > > + { > > + gimple g = vect_loop_vectorized_call (loop); > > + if (g) > > + { > > + tree lhs = gimple_call_lhs (g); > > + gimple_stmt_iterator gsi = gsi_for_stmt (g); > > + gimplify_and_update_call_from_tree (&gsi, boolean_false_node); > > + gsi_next (&gsi); > > + if (!gsi_end_p (gsi)) > > + { > > + g = gsi_stmt (gsi); > > + if (gimple_code (g) == GIMPLE_COND > > + && gimple_cond_lhs (g) == lhs) > > + { > > + gimple_cond_set_lhs (g, boolean_false_node); > > + update_stmt (g); > > + } > > + } > > + ret = TODO_cleanup_cfg; > > + } > > + } > > + } > > + > > for (i = 1; i < vect_loops_num; i++) > > { > > loop_vec_info loop_vinfo; > > @@ -451,7 +572,7 @@ vectorize_loops (void) > > return TODO_cleanup_cfg; > > } > > > > - return 0; > > + return ret; > > } > > > > > > --- gcc/tree-vectorizer.h.jj 2013-10-11 14:32:57.086909746 +0200 > > +++ gcc/tree-vectorizer.h 2013-10-14 14:32:55.538688209 +0200 > > @@ -314,6 +314,10 @@ typedef struct _loop_vec_info { > > fix it up. */ > > bool operands_swapped; > > > > + /* If if-conversion versioned this loop before conversion, this is the > > + loop version without if-conversion. */ > > + struct loop *scalar_loop; > > + > > } *loop_vec_info; > > > > /* Access Functions. */ > > @@ -345,6 +349,7 @@ typedef struct _loop_vec_info { > > #define LOOP_VINFO_TARGET_COST_DATA(L) (L)->target_cost_data > > #define LOOP_VINFO_PEELING_FOR_GAPS(L) (L)->peeling_for_gaps > > #define LOOP_VINFO_OPERANDS_SWAPPED(L) (L)->operands_swapped > > +#define LOOP_VINFO_SCALAR_LOOP(L) (L)->scalar_loop > > > > #define LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT(L) \ > > (L)->may_misalign_stmts.length () > 0 > > @@ -899,7 +904,8 @@ extern LOC vect_location; > > in tree-vect-loop-manip.c. */ > > extern void slpeel_make_loop_iterate_ntimes (struct loop *, tree); > > extern bool slpeel_can_duplicate_loop_p (const struct loop *, const_edge); > > -struct loop *slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *, edge); > > +struct loop *slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *, > > + struct loop *, edge); > > extern void vect_loop_versioning (loop_vec_info, unsigned int, bool); > > extern void vect_do_peeling_for_loop_bound (loop_vec_info, tree *, > > unsigned int, bool); > > --- gcc/cfgloop.h.jj 2013-10-11 14:32:57.089909730 +0200 > > +++ gcc/cfgloop.h 2013-10-11 17:23:58.706526905 +0200 > > @@ -177,6 +177,9 @@ struct GTY ((chain_next ("%h.next"))) lo > > /* True if we should try harder to vectorize this loop. */ > > bool force_vect; > > > > + /* True if this loop should never be vectorized. */ > > + bool dont_vectorize; > > + > > /* For SIMD loops, this is a unique identifier of the loop, referenced > > by IFN_GOMP_SIMD_VF, IFN_GOMP_SIMD_LANE and IFN_GOMP_SIMD_LAST_LANE > > builtins. */ > > --- gcc/tree-loop-distribution.c.jj 2013-10-07 15:06:40.000000000 +0200 > > +++ gcc/tree-loop-distribution.c 2013-10-14 14:33:22.448549212 +0200 > > @@ -673,7 +673,7 @@ copy_loop_before (struct loop *loop) > > edge preheader = loop_preheader_edge (loop); > > > > initialize_original_copy_tables (); > > - res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, preheader); > > + res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, NULL, preheader); > > gcc_assert (res != NULL); > > free_original_copy_tables (); > > delete_update_ssa (); > > --- gcc/internal-fn.c.jj 2013-10-11 14:32:57.092909715 +0200 > > +++ gcc/internal-fn.c 2013-10-11 17:23:58.706526905 +0200 > > @@ -133,6 +133,14 @@ expand_GOMP_SIMD_LAST_LANE (gimple stmt > > gcc_unreachable (); > > } > > > > +/* This should get folded in tree-vectorizer.c. */ > > + > > +static void > > +expand_LOOP_VECTORIZED (gimple stmt ATTRIBUTE_UNUSED) > > +{ > > + gcc_unreachable (); > > +} > > + > > /* Routines to expand each internal function, indexed by function number. > > Each routine has the prototype: > > > > --- gcc/tree-if-conv.c.jj 2013-10-11 14:32:57.095909699 +0200 > > +++ gcc/tree-if-conv.c 2013-10-11 17:23:58.707526969 +0200 > > @@ -1735,6 +1735,48 @@ combine_blocks (struct loop *loop) > > ifc_bbs = NULL; > > } > > > > +static bool > > +version_loop_for_if_conversion (struct loop *loop) > > +{ > > + basic_block cond_bb; > > + tree cond = make_ssa_name (boolean_type_node, NULL); > > + struct loop *new_loop; > > + gimple g; > > + gimple_stmt_iterator gsi; > > + void **aux = XNEWVEC (void *, loop->num_nodes); > > + unsigned int i; > > + > > + /* We have data stored in bb->aux, but loop_version also > > + uses it, so save it temporarily and restore after loop_version. */ > > + for (i = 0; i < loop->num_nodes; i++) > > + { > > + aux[i] = ifc_bbs[i]->aux; > > + ifc_bbs[i]->aux = NULL; > > + } > > + g = gimple_build_call_internal (IFN_LOOP_VECTORIZED, 2, > > + build_int_cst (integer_type_node, loop->num), > > + integer_zero_node); > > + gimple_call_set_lhs (g, cond); > > + > > + initialize_original_copy_tables (); > > + new_loop = loop_version (loop, cond, &cond_bb, > > + REG_BR_PROB_BASE, REG_BR_PROB_BASE, > > + REG_BR_PROB_BASE, true); > > + free_original_copy_tables (); > > + for (i = 0; i < loop->num_nodes; i++) > > + ifc_bbs[i]->aux = aux[i]; > > + XDELETEVEC (aux); > > + if (new_loop == NULL) > > + return false; > > + new_loop->dont_vectorize = true; > > + new_loop->force_vect = false; > > + gsi = gsi_last_bb (cond_bb); > > + gimple_call_set_arg (g, 1, build_int_cst (integer_type_node, new_loop->num)); > > + gsi_insert_before (&gsi, g, GSI_SAME_STMT); > > + update_ssa (TODO_update_ssa); > > + return true; > > +} > > + > > /* If-convert LOOP when it is legal. For the moment this pass has no > > profitability analysis. Returns true when something changed. */ > > > > @@ -1744,10 +1786,18 @@ tree_if_conversion (struct loop *loop) > > bool changed = false; > > ifc_bbs = NULL; > > > > + if (loop->dont_vectorize) > > + goto cleanup; > > + > > if (!if_convertible_loop_p (loop) > > || !dbg_cnt (if_conversion_tree)) > > goto cleanup; > > > > + if ((flag_tree_loop_vectorize || loop->force_vect) > > + && flag_tree_loop_if_convert == -1 > > + && !version_loop_for_if_conversion (loop)) > > + goto cleanup; > > + > > /* Now all statements are if-convertible. Combine all the basic > > blocks into one huge basic block doing the if-conversion > > on-the-fly. */ > > --- gcc/testsuite/gcc.dg/vect/vect-cond-11.c.jj 2013-10-15 14:01:07.877814190 +0200 > > +++ gcc/testsuite/gcc.dg/vect/vect-cond-11.c 2013-10-15 14:02:29.302414970 +0200 > > @@ -0,0 +1,116 @@ > > +#include "tree-vect.h" > > + > > +#define N 1024 > > +typedef int V __attribute__((vector_size (4))); > > +unsigned int a[N * 2] __attribute__((aligned)); > > +unsigned int b[N * 2] __attribute__((aligned)); > > +V c[N]; > > + > > +__attribute__((noinline, noclone)) unsigned int > > +foo (unsigned int *a, unsigned int *b) > > +{ > > + int i; > > + unsigned int r = 0; > > + for (i = 0; i < N; i++) > > + { > > + unsigned int x = a[i], y = b[i]; > > + if (x < 32) > > + { > > + x = x + 127; > > + y = y * 2; > > + } > > + else > > + { > > + x = x - 16; > > + y = y + 1; > > + } > > + a[i] = x; > > + b[i] = y; > > + r += x; > > + } > > + return r; > > +} > > + > > +__attribute__((noinline, noclone)) unsigned int > > +bar (unsigned int *a, unsigned int *b) > > +{ > > + int i; > > + unsigned int r = 0; > > + for (i = 0; i < N; i++) > > + { > > + unsigned int x = a[i], y = b[i]; > > + if (x < 32) > > + { > > + x = x + 127; > > + y = y * 2; > > + } > > + else > > + { > > + x = x - 16; > > + y = y + 1; > > + } > > + a[i] = x; > > + b[i] = y; > > + c[i] = c[i] + 1; > > + r += x; > > + } > > + return r; > > +} > > + > > +void > > +baz (unsigned int *a, unsigned int *b, > > + unsigned int (*fn) (unsigned int *, unsigned int *)) > > +{ > > + int i; > > + for (i = -64; i < 0; i++) > > + { > > + a[i] = 19; > > + b[i] = 17; > > + } > > + for (; i < N; i++) > > + { > > + a[i] = i - 512; > > + b[i] = i; > > + } > > + for (; i < N + 64; i++) > > + { > > + a[i] = 27; > > + b[i] = 19; > > + } > > + if (fn (a, b) != -512U - (N - 32) * 16U + 32 * 127U) > > + __builtin_abort (); > > + for (i = -64; i < 0; i++) > > + if (a[i] != 19 || b[i] != 17) > > + __builtin_abort (); > > + for (; i < N; i++) > > + if (a[i] != (i - 512U < 32U ? i - 512U + 127 : i - 512U - 16) > > + || b[i] != (i - 512U < 32U ? i * 2U : i + 1U)) > > + __builtin_abort (); > > + for (; i < N + 64; i++) > > + if (a[i] != 27 || b[i] != 19) > > + __builtin_abort (); > > +} > > + > > +int > > +main () > > +{ > > + int i; > > + check_vect (); > > + baz (a + 512, b + 512, foo); > > + baz (a + 512, b + 512, bar); > > + baz (a + 512 + 1, b + 512 + 1, foo); > > + baz (a + 512 + 1, b + 512 + 1, bar); > > + baz (a + 512 + 31, b + 512 + 31, foo); > > + baz (a + 512 + 31, b + 512 + 31, bar); > > + baz (a + 512 + 1, b + 512, foo); > > + baz (a + 512 + 1, b + 512, bar); > > + baz (a + 512 + 31, b + 512, foo); > > + baz (a + 512 + 31, b + 512, bar); > > + baz (a + 512, b + 512 + 1, foo); > > + baz (a + 512, b + 512 + 1, bar); > > + baz (a + 512, b + 512 + 31, foo); > > + baz (a + 512, b + 512 + 31, bar); > > + return 0; > > +} > > + > > +/* { dg-final { cleanup-tree-dump "vect" } } */ > > --- gcc/testsuite/gcc.dg/vect/bb-slp-cond-1.c.jj 2013-08-30 14:38:40.000000000 +0200 > > +++ gcc/testsuite/gcc.dg/vect/bb-slp-cond-1.c 2013-10-14 13:19:21.704256653 +0200 > > @@ -1,4 +1,5 @@ > > /* { dg-require-effective-target vect_condition } */ > > +/* { dg-additional-options "-ftree-loop-if-convert" } */ > > > > #include "tree-vect.h" > > > > --- gcc/testsuite/gcc.dg/vect/bb-slp-pattern-2.c.jj 2013-08-30 14:38:40.000000000 +0200 > > +++ gcc/testsuite/gcc.dg/vect/bb-slp-pattern-2.c 2013-10-14 13:19:35.678195952 +0200 > > @@ -1,4 +1,5 @@ > > /* { dg-require-effective-target vect_condition } */ > > +/* { dg-additional-options "-ftree-loop-if-convert" } */ > > > > #include "tree-vect.h" > > > > > > Jakub > >
On Thu, Oct 17, 2013 at 11:26:56AM +0200, Richard Biener wrote: > On Wed, 16 Oct 2013, pinskia@gmail.com wrote: > > > > > > On Oct 15, 2013, at 5:32 AM, Jakub Jelinek <jakub@redhat.com> wrote: > > > Especially on i?86/x86_64 if-conversion pass seems to be often > > > a pessimization, but the vectorization relies on it and without it we can't > > > vectorize a lot of the loops. > > > > I think on many other targets it actually helps. I know for one it > > helps on octeon even though octeon has no vector instructions. I think > > it helps most arm targets too. > > The main issue is that it has no cost model - the only cost model > being that it can successfully if-convert all conditional code in > a loop, resulting in a single-BB loop. So it is clearly vectorization > targeted. > > It's infrastructure may be useful to do a more sensible if-conversion > on GIMPLE level on scalar code. > > Of course even the infrastructure needs some TLC (and some better > generic machinery of keeping track and simplifying of a predicate > combination). Yeah, or, if tree if-conversion is a net win for some port even when not vectorizing, supposedly such a port should at least until the above is implemented just enable flag_tree_loop_if_convert by default, at least at some -O* levels. Of course the question is why would it be beneficial only in inner loops and not elsewhere (other loops, or stright line code), and when it is desirable and when not (for vectorization we of course try hard to if-convert the whole loop, because otherwise we are not able to vectorize it, but otherwise, is it beneficial just for a couple of conditionalized stmts at most, or is it fine to conditionalize say 1000 arithmetic statements?). Jakub
--- gcc/internal-fn.def.jj 2013-10-11 14:32:57.079909782 +0200 +++ gcc/internal-fn.def 2013-10-11 17:23:58.705526840 +0200 @@ -43,3 +43,4 @@ DEF_INTERNAL_FN (STORE_LANES, ECF_CONST DEF_INTERNAL_FN (GOMP_SIMD_LANE, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW) DEF_INTERNAL_FN (GOMP_SIMD_VF, ECF_CONST | ECF_LEAF | ECF_NOTHROW) DEF_INTERNAL_FN (GOMP_SIMD_LAST_LANE, ECF_CONST | ECF_LEAF | ECF_NOTHROW) +DEF_INTERNAL_FN (LOOP_VECTORIZED, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW) --- gcc/tree-vect-loop-manip.c.jj 2013-09-30 22:13:47.000000000 +0200 +++ gcc/tree-vect-loop-manip.c 2013-10-15 12:57:54.854970913 +0200 @@ -374,24 +374,31 @@ LOOP-> loop1 static void slpeel_update_phi_nodes_for_guard1 (edge guard_edge, struct loop *loop, + struct loop *scalar_loop, bool is_new_loop, basic_block *new_exit_bb) { - gimple orig_phi, new_phi; + gimple orig_phi, new_phi, scalar_phi = NULL; gimple update_phi, update_phi2; tree guard_arg, loop_arg; basic_block new_merge_bb = guard_edge->dest; edge e = EDGE_SUCC (new_merge_bb, 0); basic_block update_bb = e->dest; basic_block orig_bb = loop->header; - edge new_exit_e; + edge new_exit_e, scalar_e = NULL; tree current_new_name; - gimple_stmt_iterator gsi_orig, gsi_update; + gimple_stmt_iterator gsi_orig, gsi_update, gsi_scalar = gsi_none (); /* Create new bb between loop and new_merge_bb. */ *new_exit_bb = split_edge (single_exit (loop)); new_exit_e = EDGE_SUCC (*new_exit_bb, 0); + if (scalar_loop != NULL && !is_new_loop) + { + gsi_scalar = gsi_start_phis (scalar_loop->header); + scalar_e = EDGE_SUCC (scalar_loop->latch, 0); + } + for (gsi_orig = gsi_start_phis (orig_bb), gsi_update = gsi_start_phis (update_bb); !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update); @@ -401,6 +408,11 @@ slpeel_update_phi_nodes_for_guard1 (edge tree new_res; orig_phi = gsi_stmt (gsi_orig); update_phi = gsi_stmt (gsi_update); + if (scalar_e != NULL) + { + scalar_phi = gsi_stmt (gsi_scalar); + gsi_next (&gsi_scalar); + } /** 1. Handle new-merge-point phis **/ @@ -460,7 +472,13 @@ slpeel_update_phi_nodes_for_guard1 (edge current_new_name = loop_arg; else { - current_new_name = get_current_def (loop_arg); + if (scalar_e) + { + current_new_name = PHI_ARG_DEF_FROM_EDGE (scalar_phi, scalar_e); + current_new_name = get_current_def (current_new_name); + } + else + current_new_name = get_current_def (loop_arg); /* current_def is not available only if the variable does not change inside the loop, in which case we also don't care about recording a current_def for it because we won't be @@ -503,6 +521,7 @@ LOOP-> loop2 static void slpeel_update_phi_nodes_for_guard2 (edge guard_edge, struct loop *loop, + struct loop *scalar_loop, bool is_new_loop, basic_block *new_exit_bb) { gimple orig_phi, new_phi; @@ -511,17 +530,23 @@ slpeel_update_phi_nodes_for_guard2 (edge basic_block new_merge_bb = guard_edge->dest; edge e = EDGE_SUCC (new_merge_bb, 0); basic_block update_bb = e->dest; - edge new_exit_e; + edge new_exit_e, scalar_e = NULL; tree orig_def, orig_def_new_name; tree new_name, new_name2; tree arg; - gimple_stmt_iterator gsi; + gimple_stmt_iterator gsi, gsi_scalar = gsi_none (); /* Create new bb between loop and new_merge_bb. */ *new_exit_bb = split_edge (single_exit (loop)); new_exit_e = EDGE_SUCC (*new_exit_bb, 0); + if (scalar_loop != NULL) + { + scalar_e = single_exit (scalar_loop); + gsi_scalar = gsi_start_phis (scalar_e->dest); + } + for (gsi = gsi_start_phis (update_bb); !gsi_end_p (gsi); gsi_next (&gsi)) { tree new_res; @@ -532,7 +557,16 @@ slpeel_update_phi_nodes_for_guard2 (edge out of the loop - the phi arg is a constant. */ if (TREE_CODE (orig_def) != SSA_NAME) continue; - orig_def_new_name = get_current_def (orig_def); + if (scalar_loop != NULL) + { + orig_def_new_name + = PHI_ARG_DEF_FROM_EDGE (gsi_stmt (gsi_scalar), scalar_e); + gcc_assert (TREE_CODE (orig_def_new_name) == SSA_NAME); + orig_def_new_name = get_current_def (orig_def_new_name); + gsi_next (&gsi_scalar); + } + else + orig_def_new_name = get_current_def (orig_def); arg = NULL_TREE; /** 1. Handle new-merge-point phis **/ @@ -693,7 +727,8 @@ slpeel_make_loop_iterate_ntimes (struct on E which is either the entry or exit of LOOP. */ struct loop * -slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop, edge e) +slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop, + struct loop *scalar_loop, edge e) { struct loop *new_loop; basic_block *new_bbs, *bbs; @@ -707,19 +742,22 @@ slpeel_tree_duplicate_loop_to_edge_cfg ( if (!at_exit && e != loop_preheader_edge (loop)) return NULL; - bbs = XNEWVEC (basic_block, loop->num_nodes + 1); - get_loop_body_with_size (loop, bbs, loop->num_nodes); + if (scalar_loop == NULL) + scalar_loop = loop; + + bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1); + get_loop_body_with_size (scalar_loop, bbs, scalar_loop->num_nodes); /* Check whether duplication is possible. */ - if (!can_copy_bbs_p (bbs, loop->num_nodes)) + if (!can_copy_bbs_p (bbs, scalar_loop->num_nodes)) { free (bbs); return NULL; } /* Generate new loop structure. */ - new_loop = duplicate_loop (loop, loop_outer (loop)); - duplicate_subloops (loop, new_loop); + new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop)); + duplicate_subloops (scalar_loop, new_loop); exit_dest = exit->dest; was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS, @@ -729,35 +767,66 @@ slpeel_tree_duplicate_loop_to_edge_cfg ( /* Also copy the pre-header, this avoids jumping through hoops to duplicate the loop entry PHI arguments. Create an empty pre-header unconditionally for this. */ - basic_block preheader = split_edge (loop_preheader_edge (loop)); + basic_block preheader = split_edge (loop_preheader_edge (scalar_loop)); edge entry_e = single_pred_edge (preheader); - bbs[loop->num_nodes] = preheader; - new_bbs = XNEWVEC (basic_block, loop->num_nodes + 1); + bbs[scalar_loop->num_nodes] = preheader; + new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1); - copy_bbs (bbs, loop->num_nodes + 1, new_bbs, + exit = single_exit (scalar_loop); + copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs, &exit, 1, &new_exit, NULL, e->src, true); - basic_block new_preheader = new_bbs[loop->num_nodes]; + exit = single_exit (loop); + basic_block new_preheader = new_bbs[scalar_loop->num_nodes]; - add_phi_args_after_copy (new_bbs, loop->num_nodes + 1, NULL); + add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL); if (at_exit) /* Add the loop copy at exit. */ { + if (scalar_loop != loop) + { + gimple_stmt_iterator gsi; + new_exit = redirect_edge_and_branch (new_exit, exit_dest); + + for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi); + gsi_next (&gsi)) + { + gimple phi = gsi_stmt (gsi); + tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e); + location_t orig_locus + = gimple_phi_arg_location_from_edge (phi, e); + + add_phi_arg (phi, orig_arg, new_exit, orig_locus); + } + } redirect_edge_and_branch_force (e, new_preheader); flush_pending_stmts (e); set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src); if (was_imm_dom) - set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_loop->header); + set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src); /* And remove the non-necessary forwarder again. Keep the other one so we have a proper pre-header for the loop at the exit edge. */ - redirect_edge_pred (single_succ_edge (preheader), single_pred (preheader)); + redirect_edge_pred (single_succ_edge (preheader), + single_pred (preheader)); delete_basic_block (preheader); - set_immediate_dominator (CDI_DOMINATORS, loop->header, - loop_preheader_edge (loop)->src); + set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header, + loop_preheader_edge (scalar_loop)->src); } else /* Add the copy at entry. */ { + if (scalar_loop != loop) + { + /* Remove the non-necessary forwarder of scalar_loop again. */ + redirect_edge_pred (single_succ_edge (preheader), + single_pred (preheader)); + delete_basic_block (preheader); + set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header, + loop_preheader_edge (scalar_loop)->src); + preheader = split_edge (loop_preheader_edge (loop)); + entry_e = single_pred_edge (preheader); + } + redirect_edge_and_branch_force (entry_e, new_preheader); flush_pending_stmts (entry_e); set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src); @@ -768,15 +837,39 @@ slpeel_tree_duplicate_loop_to_edge_cfg ( /* And remove the non-necessary forwarder again. Keep the other one so we have a proper pre-header for the loop at the exit edge. */ - redirect_edge_pred (single_succ_edge (new_preheader), single_pred (new_preheader)); + redirect_edge_pred (single_succ_edge (new_preheader), + single_pred (new_preheader)); delete_basic_block (new_preheader); set_immediate_dominator (CDI_DOMINATORS, new_loop->header, loop_preheader_edge (new_loop)->src); } - for (unsigned i = 0; i < loop->num_nodes+1; i++) + for (unsigned i = 0; i < scalar_loop->num_nodes + 1; i++) rename_variables_in_bb (new_bbs[i]); + if (scalar_loop != loop) + { + /* Update new_loop->header PHIs, so that on the preheader + edge they are the ones from loop rather than scalar_loop. */ + gimple_stmt_iterator gsi_orig, gsi_new; + edge orig_e = loop_preheader_edge (loop); + edge new_e = loop_preheader_edge (new_loop); + + for (gsi_orig = gsi_start_phis (loop->header), + gsi_new = gsi_start_phis (new_loop->header); + !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new); + gsi_next (&gsi_orig), gsi_next (&gsi_new)) + { + gimple orig_phi = gsi_stmt (gsi_orig); + gimple new_phi = gsi_stmt (gsi_new); + tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e); + location_t orig_locus + = gimple_phi_arg_location_from_edge (orig_phi, orig_e); + + add_phi_arg (new_phi, orig_arg, new_e, orig_locus); + } + } + free (new_bbs); free (bbs); @@ -1028,8 +1121,8 @@ set_prologue_iterations (basic_block bb_ FORNOW the resulting code will not be in loop-closed-ssa form. */ -static struct loop* -slpeel_tree_peel_loop_to_edge (struct loop *loop, +static struct loop * +slpeel_tree_peel_loop_to_edge (struct loop *loop, struct loop *scalar_loop, edge e, tree *first_niters, tree niters, bool update_first_loop_count, unsigned int th, bool check_profitability, @@ -1114,7 +1207,8 @@ slpeel_tree_peel_loop_to_edge (struct lo orig_exit_bb: */ - if (!(new_loop = slpeel_tree_duplicate_loop_to_edge_cfg (loop, e))) + if (!(new_loop = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, + e))) { loop_loc = find_loop_location (loop); dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc, @@ -1291,7 +1385,7 @@ slpeel_tree_peel_loop_to_edge (struct lo inverse_probability (first_guard_probability)); scale_loop_profile (first_loop, first_guard_probability, check_profitability && (int)th > bound1 ? th : bound1); - slpeel_update_phi_nodes_for_guard1 (skip_e, first_loop, + slpeel_update_phi_nodes_for_guard1 (skip_e, first_loop, scalar_loop, first_loop == new_loop, &new_exit_bb); @@ -1331,7 +1425,7 @@ slpeel_tree_peel_loop_to_edge (struct lo bb_after_second_loop, bb_before_first_loop, inverse_probability (second_guard_probability)); scale_loop_profile (second_loop, probability_of_second_loop, bound2); - slpeel_update_phi_nodes_for_guard2 (skip_e, second_loop, + slpeel_update_phi_nodes_for_guard2 (skip_e, second_loop, scalar_loop, second_loop == new_loop, &new_exit_bb); /* 4. Make first-loop iterate FIRST_NITERS times, if requested. @@ -1755,6 +1849,7 @@ vect_do_peeling_for_loop_bound (loop_vec { tree ni_name, ratio_mult_vf_name; struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo); struct loop *new_loop; edge update_e; basic_block preheader; @@ -1780,11 +1875,12 @@ vect_do_peeling_for_loop_bound (loop_vec loop_num = loop->num; - new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop), - &ratio_mult_vf_name, ni_name, false, - th, check_profitability, - cond_expr, cond_expr_stmt_list, - 0, LOOP_VINFO_VECT_FACTOR (loop_vinfo)); + new_loop + = slpeel_tree_peel_loop_to_edge (loop, scalar_loop, single_exit (loop), + &ratio_mult_vf_name, ni_name, false, + th, check_profitability, + cond_expr, cond_expr_stmt_list, + 0, LOOP_VINFO_VECT_FACTOR (loop_vinfo)); gcc_assert (new_loop); gcc_assert (loop_num == loop->num); #ifdef ENABLE_CHECKING @@ -2017,6 +2113,7 @@ vect_do_peeling_for_alignment (loop_vec_ unsigned int th, bool check_profitability) { struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo); tree niters_of_prolog_loop, ni_name; tree n_iters; tree wide_prolog_niters; @@ -2038,11 +2135,11 @@ vect_do_peeling_for_alignment (loop_vec_ /* Peel the prolog loop and iterate it niters_of_prolog_loop. */ new_loop = - slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop), + slpeel_tree_peel_loop_to_edge (loop, scalar_loop, + loop_preheader_edge (loop), &niters_of_prolog_loop, ni_name, true, th, check_profitability, NULL_TREE, NULL, - bound, - 0); + bound, 0); gcc_assert (new_loop); #ifdef ENABLE_CHECKING @@ -2398,6 +2495,7 @@ vect_loop_versioning (loop_vec_info loop unsigned int th, bool check_profitability) { struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); + struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo); basic_block condition_bb; gimple_stmt_iterator gsi, cond_exp_gsi; basic_block merge_bb; @@ -2433,8 +2531,45 @@ vect_loop_versioning (loop_vec_info loop gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list); initialize_original_copy_tables (); - loop_version (loop, cond_expr, &condition_bb, - prob, prob, REG_BR_PROB_BASE - prob, true); + if (scalar_loop) + { + edge scalar_e; + basic_block preheader, scalar_preheader; + + /* We don't want to scale SCALAR_LOOP's frequencies, we need to + scale LOOP's frequencies instead. */ + loop_version (scalar_loop, cond_expr, &condition_bb, + prob, REG_BR_PROB_BASE, REG_BR_PROB_BASE - prob, true); + scale_loop_frequencies (loop, prob, REG_BR_PROB_BASE); + /* CONDITION_BB was created above SCALAR_LOOP's preheader, + while we need to move it above LOOP's preheader. */ + e = loop_preheader_edge (loop); + scalar_e = loop_preheader_edge (scalar_loop); + gcc_assert (gimple_seq_empty_p (bb_seq (e->src)) + && gimple_seq_empty_p (phi_nodes (e->src)) + && single_pred_p (e->src)); + gcc_assert (gimple_seq_empty_p (bb_seq (scalar_e->src)) + && gimple_seq_empty_p (phi_nodes (scalar_e->src)) + && single_pred_p (scalar_e->src)); + gcc_assert (single_pred_p (condition_bb)); + preheader = e->src; + scalar_preheader = scalar_e->src; + scalar_e = find_edge (condition_bb, scalar_preheader); + e = single_pred_edge (preheader); + redirect_edge_and_branch_force (single_pred_edge (condition_bb), + scalar_preheader); + redirect_edge_and_branch_force (scalar_e, preheader); + redirect_edge_and_branch_force (e, condition_bb); + set_immediate_dominator (CDI_DOMINATORS, condition_bb, + single_pred (condition_bb)); + set_immediate_dominator (CDI_DOMINATORS, scalar_preheader, + single_pred (scalar_preheader)); + set_immediate_dominator (CDI_DOMINATORS, preheader, + condition_bb); + } + else + loop_version (loop, cond_expr, &condition_bb, + prob, prob, REG_BR_PROB_BASE - prob, true); if (LOCATION_LOCUS (vect_location) != UNKNOWN_LOC && dump_enabled_p ()) @@ -2457,24 +2592,29 @@ vect_loop_versioning (loop_vec_info loop basic block (i.e. it has two predecessors). Just in order to simplify following transformations in the vectorizer, we fix this situation here by adding a new (empty) block on the exit-edge of the loop, - with the proper loop-exit phis to maintain loop-closed-form. */ + with the proper loop-exit phis to maintain loop-closed-form. + If loop versioning wasn't done from loop, but scalar_loop instead, + merge_bb will have already just a single successor. */ merge_bb = single_exit (loop)->dest; - gcc_assert (EDGE_COUNT (merge_bb->preds) == 2); - new_exit_bb = split_edge (single_exit (loop)); - new_exit_e = single_exit (loop); - e = EDGE_SUCC (new_exit_bb, 0); - - for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi); gsi_next (&gsi)) + if (scalar_loop == NULL || EDGE_COUNT (merge_bb->preds) >= 2) { - tree new_res; - orig_phi = gsi_stmt (gsi); - new_res = copy_ssa_name (PHI_RESULT (orig_phi), NULL); - new_phi = create_phi_node (new_res, new_exit_bb); - arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e); - add_phi_arg (new_phi, arg, new_exit_e, - gimple_phi_arg_location_from_edge (orig_phi, e)); - adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi)); + gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2); + new_exit_bb = split_edge (single_exit (loop)); + new_exit_e = single_exit (loop); + e = EDGE_SUCC (new_exit_bb, 0); + + for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi); gsi_next (&gsi)) + { + tree new_res; + orig_phi = gsi_stmt (gsi); + new_res = copy_ssa_name (PHI_RESULT (orig_phi), NULL); + new_phi = create_phi_node (new_res, new_exit_bb); + arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e); + add_phi_arg (new_phi, arg, new_exit_e, + gimple_phi_arg_location_from_edge (orig_phi, e)); + adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi)); + } } /* End loop-exit-fixes after versioning. */ --- gcc/tree-vectorizer.c.jj 2013-10-11 14:32:57.082909767 +0200 +++ gcc/tree-vectorizer.c 2013-10-14 15:34:19.921860478 +0200 @@ -306,6 +306,43 @@ vect_destroy_datarefs (loop_vec_info loo } +/* If LOOP has been versioned during ifcvt, return the internal call + guarding it. */ + +static gimple +vect_loop_vectorized_call (struct loop *loop) +{ + basic_block bb = loop_preheader_edge (loop)->src; + gimple g; + do + { + g = last_stmt (bb); + if (g) + break; + if (!single_pred_p (bb)) + break; + bb = single_pred (bb); + } + while (1); + if (g && gimple_code (g) == GIMPLE_COND) + { + gimple_stmt_iterator gsi = gsi_for_stmt (g); + gsi_prev (&gsi); + if (!gsi_end_p (gsi)) + { + g = gsi_stmt (gsi); + if (is_gimple_call (g) + && gimple_call_internal_p (g) + && gimple_call_internal_fn (g) == IFN_LOOP_VECTORIZED + && (tree_low_cst (gimple_call_arg (g, 0), 0) == loop->num + || tree_low_cst (gimple_call_arg (g, 1), 0) == loop->num)) + return g; + } + } + return NULL; +} + + /* Function vectorize_loops. Entry point to loop vectorization phase. */ @@ -320,6 +357,8 @@ vectorize_loops (void) struct loop *loop; hash_table <simduid_to_vf> simduid_to_vf_htab; hash_table <simd_array_to_simduid> simd_array_to_simduid_htab; + bool any_ifcvt_loops = false; + unsigned ret = 0; vect_loops_num = number_of_loops (cfun); @@ -342,8 +381,11 @@ vectorize_loops (void) than all previously defined loops. This fact allows us to run only over initial loops skipping newly generated ones. */ FOR_EACH_LOOP (li, loop, 0) - if ((flag_tree_loop_vectorize && optimize_loop_nest_for_speed_p (loop)) - || loop->force_vect) + if (loop->dont_vectorize) + any_ifcvt_loops = true; + else if ((flag_tree_loop_vectorize + && optimize_loop_nest_for_speed_p (loop)) + || loop->force_vect) { loop_vec_info loop_vinfo; vect_location = find_loop_location (loop); @@ -361,6 +403,38 @@ vectorize_loops (void) if (!dbg_cnt (vect_loop)) break; + gimple loop_vectorized_call = vect_loop_vectorized_call (loop); + if (loop_vectorized_call) + { + tree arg = gimple_call_arg (loop_vectorized_call, 1); + basic_block *bbs; + unsigned int i; + struct loop *scalar_loop = get_loop (cfun, tree_low_cst (arg, 0)); + + LOOP_VINFO_SCALAR_LOOP (loop_vinfo) = scalar_loop; + gcc_checking_assert (vect_loop_vectorized_call + (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)) + == loop_vectorized_call); + bbs = get_loop_body (scalar_loop); + for (i = 0; i < scalar_loop->num_nodes; i++) + { + basic_block bb = bbs[i]; + gimple_stmt_iterator gsi; + for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); + gsi_next (&gsi)) + { + gimple phi = gsi_stmt (gsi); + gimple_set_uid (phi, 0); + } + for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); + gsi_next (&gsi)) + { + gimple stmt = gsi_stmt (gsi); + gimple_set_uid (stmt, 0); + } + } + free (bbs); + } if (LOCATION_LOCUS (vect_location) != UNKNOWN_LOC && dump_enabled_p ()) dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location, @@ -381,6 +455,25 @@ vectorize_loops (void) *simduid_to_vf_htab.find_slot (simduid_to_vf_data, INSERT) = simduid_to_vf_data; } + + if (loop_vectorized_call) + { + gimple g = loop_vectorized_call; + tree lhs = gimple_call_lhs (g); + gimple_stmt_iterator gsi = gsi_for_stmt (g); + gimplify_and_update_call_from_tree (&gsi, boolean_true_node); + gsi_next (&gsi); + if (!gsi_end_p (gsi)) + { + g = gsi_stmt (gsi); + if (gimple_code (g) == GIMPLE_COND + && gimple_cond_lhs (g) == lhs) + { + gimple_cond_set_lhs (g, boolean_true_node); + update_stmt (g); + } + } + } } vect_location = UNKNOWN_LOC; @@ -394,6 +487,34 @@ vectorize_loops (void) /* ----------- Finalize. ----------- */ + if (any_ifcvt_loops) + for (i = 1; i < vect_loops_num; i++) + { + loop = get_loop (cfun, i); + if (loop && loop->dont_vectorize) + { + gimple g = vect_loop_vectorized_call (loop); + if (g) + { + tree lhs = gimple_call_lhs (g); + gimple_stmt_iterator gsi = gsi_for_stmt (g); + gimplify_and_update_call_from_tree (&gsi, boolean_false_node); + gsi_next (&gsi); + if (!gsi_end_p (gsi)) + { + g = gsi_stmt (gsi); + if (gimple_code (g) == GIMPLE_COND + && gimple_cond_lhs (g) == lhs) + { + gimple_cond_set_lhs (g, boolean_false_node); + update_stmt (g); + } + } + ret = TODO_cleanup_cfg; + } + } + } + for (i = 1; i < vect_loops_num; i++) { loop_vec_info loop_vinfo; @@ -451,7 +572,7 @@ vectorize_loops (void) return TODO_cleanup_cfg; } - return 0; + return ret; } --- gcc/tree-vectorizer.h.jj 2013-10-11 14:32:57.086909746 +0200 +++ gcc/tree-vectorizer.h 2013-10-14 14:32:55.538688209 +0200 @@ -314,6 +314,10 @@ typedef struct _loop_vec_info { fix it up. */ bool operands_swapped; + /* If if-conversion versioned this loop before conversion, this is the + loop version without if-conversion. */ + struct loop *scalar_loop; + } *loop_vec_info; /* Access Functions. */ @@ -345,6 +349,7 @@ typedef struct _loop_vec_info { #define LOOP_VINFO_TARGET_COST_DATA(L) (L)->target_cost_data #define LOOP_VINFO_PEELING_FOR_GAPS(L) (L)->peeling_for_gaps #define LOOP_VINFO_OPERANDS_SWAPPED(L) (L)->operands_swapped +#define LOOP_VINFO_SCALAR_LOOP(L) (L)->scalar_loop #define LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT(L) \ (L)->may_misalign_stmts.length () > 0 @@ -899,7 +904,8 @@ extern LOC vect_location; in tree-vect-loop-manip.c. */ extern void slpeel_make_loop_iterate_ntimes (struct loop *, tree); extern bool slpeel_can_duplicate_loop_p (const struct loop *, const_edge); -struct loop *slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *, edge); +struct loop *slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *, + struct loop *, edge); extern void vect_loop_versioning (loop_vec_info, unsigned int, bool); extern void vect_do_peeling_for_loop_bound (loop_vec_info, tree *, unsigned int, bool); --- gcc/cfgloop.h.jj 2013-10-11 14:32:57.089909730 +0200 +++ gcc/cfgloop.h 2013-10-11 17:23:58.706526905 +0200 @@ -177,6 +177,9 @@ struct GTY ((chain_next ("%h.next"))) lo /* True if we should try harder to vectorize this loop. */ bool force_vect; + /* True if this loop should never be vectorized. */ + bool dont_vectorize; + /* For SIMD loops, this is a unique identifier of the loop, referenced by IFN_GOMP_SIMD_VF, IFN_GOMP_SIMD_LANE and IFN_GOMP_SIMD_LAST_LANE builtins. */ --- gcc/tree-loop-distribution.c.jj 2013-10-07 15:06:40.000000000 +0200 +++ gcc/tree-loop-distribution.c 2013-10-14 14:33:22.448549212 +0200 @@ -673,7 +673,7 @@ copy_loop_before (struct loop *loop) edge preheader = loop_preheader_edge (loop); initialize_original_copy_tables (); - res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, preheader); + res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, NULL, preheader); gcc_assert (res != NULL); free_original_copy_tables (); delete_update_ssa (); --- gcc/internal-fn.c.jj 2013-10-11 14:32:57.092909715 +0200 +++ gcc/internal-fn.c 2013-10-11 17:23:58.706526905 +0200 @@ -133,6 +133,14 @@ expand_GOMP_SIMD_LAST_LANE (gimple stmt gcc_unreachable (); } +/* This should get folded in tree-vectorizer.c. */ + +static void +expand_LOOP_VECTORIZED (gimple stmt ATTRIBUTE_UNUSED) +{ + gcc_unreachable (); +} + /* Routines to expand each internal function, indexed by function number. Each routine has the prototype: --- gcc/tree-if-conv.c.jj 2013-10-11 14:32:57.095909699 +0200 +++ gcc/tree-if-conv.c 2013-10-11 17:23:58.707526969 +0200 @@ -1735,6 +1735,48 @@ combine_blocks (struct loop *loop) ifc_bbs = NULL; } +static bool +version_loop_for_if_conversion (struct loop *loop) +{ + basic_block cond_bb; + tree cond = make_ssa_name (boolean_type_node, NULL); + struct loop *new_loop; + gimple g; + gimple_stmt_iterator gsi; + void **aux = XNEWVEC (void *, loop->num_nodes); + unsigned int i; + + /* We have data stored in bb->aux, but loop_version also + uses it, so save it temporarily and restore after loop_version. */ + for (i = 0; i < loop->num_nodes; i++) + { + aux[i] = ifc_bbs[i]->aux; + ifc_bbs[i]->aux = NULL; + } + g = gimple_build_call_internal (IFN_LOOP_VECTORIZED, 2, + build_int_cst (integer_type_node, loop->num), + integer_zero_node); + gimple_call_set_lhs (g, cond); + + initialize_original_copy_tables (); + new_loop = loop_version (loop, cond, &cond_bb, + REG_BR_PROB_BASE, REG_BR_PROB_BASE, + REG_BR_PROB_BASE, true); + free_original_copy_tables (); + for (i = 0; i < loop->num_nodes; i++) + ifc_bbs[i]->aux = aux[i]; + XDELETEVEC (aux); + if (new_loop == NULL) + return false; + new_loop->dont_vectorize = true; + new_loop->force_vect = false; + gsi = gsi_last_bb (cond_bb); + gimple_call_set_arg (g, 1, build_int_cst (integer_type_node, new_loop->num)); + gsi_insert_before (&gsi, g, GSI_SAME_STMT); + update_ssa (TODO_update_ssa); + return true; +} + /* If-convert LOOP when it is legal. For the moment this pass has no profitability analysis. Returns true when something changed. */ @@ -1744,10 +1786,18 @@ tree_if_conversion (struct loop *loop) bool changed = false; ifc_bbs = NULL; + if (loop->dont_vectorize) + goto cleanup; + if (!if_convertible_loop_p (loop) || !dbg_cnt (if_conversion_tree)) goto cleanup; + if ((flag_tree_loop_vectorize || loop->force_vect) + && flag_tree_loop_if_convert == -1 + && !version_loop_for_if_conversion (loop)) + goto cleanup; + /* Now all statements are if-convertible. Combine all the basic blocks into one huge basic block doing the if-conversion on-the-fly. */ --- gcc/testsuite/gcc.dg/vect/vect-cond-11.c.jj 2013-10-15 14:01:07.877814190 +0200 +++ gcc/testsuite/gcc.dg/vect/vect-cond-11.c 2013-10-15 14:02:29.302414970 +0200 @@ -0,0 +1,116 @@ +#include "tree-vect.h" + +#define N 1024 +typedef int V __attribute__((vector_size (4))); +unsigned int a[N * 2] __attribute__((aligned)); +unsigned int b[N * 2] __attribute__((aligned)); +V c[N]; + +__attribute__((noinline, noclone)) unsigned int +foo (unsigned int *a, unsigned int *b) +{ + int i; + unsigned int r = 0; + for (i = 0; i < N; i++) + { + unsigned int x = a[i], y = b[i]; + if (x < 32) + { + x = x + 127; + y = y * 2; + } + else + { + x = x - 16; + y = y + 1; + } + a[i] = x; + b[i] = y; + r += x; + } + return r; +} + +__attribute__((noinline, noclone)) unsigned int +bar (unsigned int *a, unsigned int *b) +{ + int i; + unsigned int r = 0; + for (i = 0; i < N; i++) + { + unsigned int x = a[i], y = b[i]; + if (x < 32) + { + x = x + 127; + y = y * 2; + } + else + { + x = x - 16; + y = y + 1; + } + a[i] = x; + b[i] = y; + c[i] = c[i] + 1; + r += x; + } + return r; +} + +void +baz (unsigned int *a, unsigned int *b, + unsigned int (*fn) (unsigned int *, unsigned int *)) +{ + int i; + for (i = -64; i < 0; i++) + { + a[i] = 19; + b[i] = 17; + } + for (; i < N; i++) + { + a[i] = i - 512; + b[i] = i; + } + for (; i < N + 64; i++) + { + a[i] = 27; + b[i] = 19; + } + if (fn (a, b) != -512U - (N - 32) * 16U + 32 * 127U) + __builtin_abort (); + for (i = -64; i < 0; i++) + if (a[i] != 19 || b[i] != 17) + __builtin_abort (); + for (; i < N; i++) + if (a[i] != (i - 512U < 32U ? i - 512U + 127 : i - 512U - 16) + || b[i] != (i - 512U < 32U ? i * 2U : i + 1U)) + __builtin_abort (); + for (; i < N + 64; i++) + if (a[i] != 27 || b[i] != 19) + __builtin_abort (); +} + +int +main () +{ + int i; + check_vect (); + baz (a + 512, b + 512, foo); + baz (a + 512, b + 512, bar); + baz (a + 512 + 1, b + 512 + 1, foo); + baz (a + 512 + 1, b + 512 + 1, bar); + baz (a + 512 + 31, b + 512 + 31, foo); + baz (a + 512 + 31, b + 512 + 31, bar); + baz (a + 512 + 1, b + 512, foo); + baz (a + 512 + 1, b + 512, bar); + baz (a + 512 + 31, b + 512, foo); + baz (a + 512 + 31, b + 512, bar); + baz (a + 512, b + 512 + 1, foo); + baz (a + 512, b + 512 + 1, bar); + baz (a + 512, b + 512 + 31, foo); + baz (a + 512, b + 512 + 31, bar); + return 0; +} + +/* { dg-final { cleanup-tree-dump "vect" } } */ --- gcc/testsuite/gcc.dg/vect/bb-slp-cond-1.c.jj 2013-08-30 14:38:40.000000000 +0200 +++ gcc/testsuite/gcc.dg/vect/bb-slp-cond-1.c 2013-10-14 13:19:21.704256653 +0200 @@ -1,4 +1,5 @@ /* { dg-require-effective-target vect_condition } */ +/* { dg-additional-options "-ftree-loop-if-convert" } */ #include "tree-vect.h" --- gcc/testsuite/gcc.dg/vect/bb-slp-pattern-2.c.jj 2013-08-30 14:38:40.000000000 +0200 +++ gcc/testsuite/gcc.dg/vect/bb-slp-pattern-2.c 2013-10-14 13:19:35.678195952 +0200 @@ -1,4 +1,5 @@ /* { dg-require-effective-target vect_condition } */ +/* { dg-additional-options "-ftree-loop-if-convert" } */ #include "tree-vect.h"