From patchwork Wed Aug  5 12:06:03 2020
From: Richard Biener <rguenther@suse.de>
To: gcc-patches@gcc.gnu.org
Date: Wed, 5 Aug 2020 14:06:03 +0200 (CEST)
Subject: [PATCH] refactor LIM a bit
X-Patchwork-Id: 1341213

This refactors LIM to eschew alloc_aux_for_edges and re-uses the RPO
order of the move_computations walk for the invariantness computation
as well.  It also removes one unnecessary sort (but retains it as
checking code, because we bsearch the vector) and moves the edge-insert
commit code to the place where it does not have to scan all of the
function's edges.

This was all done while investigating whether LIM can be refactored to
work on a specific loop for on-demand processing (but we're not there
yet).

Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.
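The shape of the RPO change, distilled: compute one reverse post-order
up front and drive both the invariantness walk and the code-motion walk
from it, rather than running a dom_walker and then recomputing an RPO
inside move_computations.  A minimal stand-alone sketch of that
structure follows (all names are illustrative stand-ins, not the GCC
internals; the real code is in the diff below):

/* Sketch only: compute one ordering, reuse it for both walks.  */
#include <stdio.h>

static void
compute_invariantness (int bb)		/* per-block analysis stand-in */
{
  printf ("analyze bb %d\n", bb);
}

static unsigned int
move_computations_worker (int bb)	/* per-block motion stand-in */
{
  printf ("move from bb %d\n", bb);
  return 0;
}

int
main (void)
{
  /* Stand-in for pre_and_rev_post_order_compute_fn.  */
  int rpo[] = { 0, 1, 3, 2, 4 };
  int n = sizeof rpo / sizeof rpo[0];
  unsigned int todo = 0;

  for (int i = 0; i < n; ++i)	/* first walk: invariantness */
    compute_invariantness (rpo[i]);

  /* ... store motion would run here ... */

  for (int i = 0; i < n; ++i)	/* second walk shares the same order */
    todo |= move_computations_worker (rpo[i]);

  return (int) todo;
}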
2020-08-05  Richard Biener  <rguenther@suse.de>

	* tree-ssa-loop-im.c (invariantness_dom_walker): Remove.
	(invariantness_dom_walker::before_dom_children): Move to ...
	(compute_invariantness): ... this function.
	(move_computations): Inline ...
	(tree_ssa_lim): ... here, share RPO order and avoid some
	cfun references.
	(analyze_memory_references): Remove sorting of location
	lists, instead assert they are sorted already when checking.
	(prev_flag_edges): Remove.
	(execute_sm_if_changed): Pass down and adjust prev edge state.
	(execute_sm_exit): Likewise.
	(hoist_memory_references): Likewise.  Commit edge insertions
	of each processed exit.
	(store_motion_loop): Do not commit edge insertions on all
	edges in the function.
	(tree_ssa_lim_initialize): Do not call alloc_aux_for_edges.
	(tree_ssa_lim_finalize): Do not call free_aux_for_edges.
---
 gcc/tree-ssa-loop-im.c | 153 ++++++++++++++++-------------------------
 1 file changed, 58 insertions(+), 95 deletions(-)

diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c
index d33f5335e2b..35da1fb26a6 100644
--- a/gcc/tree-ssa-loop-im.c
+++ b/gcc/tree-ssa-loop-im.c
@@ -37,7 +37,6 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-ssa-loop.h"
 #include "tree-into-ssa.h"
 #include "cfgloop.h"
-#include "domwalk.h"
 #include "tree-affine.h"
 #include "tree-ssa-propagate.h"
 #include "trans-mem.h"
@@ -970,25 +969,12 @@ rewrite_bittest (gimple_stmt_iterator *bsi)
   return stmt;
 }
 
-/* For each statement determines the outermost loop in that it is invariant,
-   statements on whose motion it depends and the cost of the computation.
-   This information is stored to the LIM_DATA structure associated with
-   each statement.  */
-class invariantness_dom_walker : public dom_walker
-{
-public:
-  invariantness_dom_walker (cdi_direction direction)
-    : dom_walker (direction) {}
-
-  virtual edge before_dom_children (basic_block);
-};
-
 /* Determine the outermost loops in that statements in basic block BB are
-   invariant, and record them to the LIM_DATA associated with the statements.
-   Callback for dom_walker.  */
+   invariant, and record them to the LIM_DATA associated with the
+   statements.  */
 
-edge
-invariantness_dom_walker::before_dom_children (basic_block bb)
+static void
+compute_invariantness (basic_block bb)
 {
   enum move_pos pos;
   gimple_stmt_iterator bsi;
@@ -998,7 +984,7 @@ invariantness_dom_walker::before_dom_children (basic_block bb)
   struct lim_aux_data *lim_data;
 
   if (!loop_outer (bb->loop_father))
-    return NULL;
+    return;
 
   if (dump_file && (dump_flags & TDF_DETAILS))
     fprintf (dump_file, "Basic block %d (loop %d -- depth %d):\n\n",
@@ -1122,7 +1108,6 @@ invariantness_dom_walker::before_dom_children (basic_block bb)
       if (lim_data->cost >= LIM_EXPENSIVE)
 	set_profitable_level (stmt);
     }
-  return NULL;
 }
 
 /* Hoist the statements in basic block BB out of the loops prescribed by
@@ -1289,28 +1274,6 @@ move_computations_worker (basic_block bb)
   return todo;
 }
 
-/* Hoist the statements out of the loops prescribed by data stored in
-   LIM_DATA structures associated with each statement.*/
-
-static unsigned int
-move_computations (void)
-{
-  int *rpo = XNEWVEC (int, last_basic_block_for_fn (cfun));
-  int n = pre_and_rev_post_order_compute_fn (cfun, NULL, rpo, false);
-  unsigned todo = 0;
-
-  for (int i = 0; i < n; ++i)
-    todo |= move_computations_worker (BASIC_BLOCK_FOR_FN (cfun, rpo[i]));
-
-  free (rpo);
-
-  gsi_commit_edge_inserts ();
-  if (need_ssa_update_p (cfun))
-    rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
-
-  return todo;
-}
-
 /* Checks whether the statement defining variable *INDEX can be hoisted
    out of the loop passed in DATA.  Callback for for_each_index.  */
 
@@ -1678,7 +1641,9 @@ analyze_memory_references (void)
 	  bb_loop_postorder);
 
   /* Visit blocks in loop postorder and assign mem-ref IDs in that order.
-     That results in better locality for all the bitmaps.  */
+     That results in better locality for all the bitmaps.  It also
+     automatically sorts the location list of gathered memory references
+     after their loop postorder number allowing to binary-search it.  */
   for (i = 0; i < n; ++i)
     {
       basic_block bb = bbs[i];
@@ -1686,12 +1651,17 @@ analyze_memory_references (void)
 	gather_mem_refs_stmt (bb->loop_father, gsi_stmt (bsi));
     }
 
-  /* Sort the location list of gathered memory references after their
+  /* Verify the list of gathered memory references is sorted after their
      loop postorder number.  */
-  im_mem_ref *ref;
-  FOR_EACH_VEC_ELT (memory_accesses.refs_list, i, ref)
-    ref->accesses_in_loop.sort (sort_locs_in_loop_postorder_cmp,
-				bb_loop_postorder);
+  if (flag_checking)
+    {
+      im_mem_ref *ref;
+      FOR_EACH_VEC_ELT (memory_accesses.refs_list, i, ref)
+	for (unsigned j = 1; j < ref->accesses_in_loop.length (); ++j)
+	  gcc_assert (sort_locs_in_loop_postorder_cmp
+			(&ref->accesses_in_loop[j-1], &ref->accesses_in_loop[j],
+			 bb_loop_postorder) <= 0);
+    }
 
   free (bbs);
 
@@ -1865,14 +1835,6 @@ first_mem_ref_loc (class loop *loop, im_mem_ref *ref)
   return locp;
 }
 
-struct prev_flag_edges {
-  /* Edge to insert new flag comparison code.  */
-  edge append_cond_position;
-
-  /* Edge for fall through from previous flag comparison.  */
-  edge last_cond_fallthru;
-};
-
 /* Helper function for execute_sm.  Emit code to store TMP_VAR into
    MEM along edge EX.
 
@@ -1920,14 +1882,14 @@ struct prev_flag_edges {
 
 static void
 execute_sm_if_changed (edge ex, tree mem, tree tmp_var, tree flag,
-		       edge preheader, hash_set <basic_block> *flag_bbs)
+		       edge preheader, hash_set <basic_block> *flag_bbs,
+		       edge &append_cond_position, edge &last_cond_fallthru)
 {
   basic_block new_bb, then_bb, old_dest;
   bool loop_has_only_one_exit;
-  edge then_old_edge, orig_ex = ex;
+  edge then_old_edge;
   gimple_stmt_iterator gsi;
   gimple *stmt;
-  struct prev_flag_edges *prev_edges = (struct prev_flag_edges *) ex->aux;
   bool irr = ex->flags & EDGE_IRREDUCIBLE_LOOP;
 
   profile_count count_sum = profile_count::zero ();
@@ -1975,8 +1937,8 @@ execute_sm_if_changed (edge ex, tree mem, tree tmp_var, tree flag,
 
   /* ?? Insert store after previous store if applicable.  See note
      below.  */
-  if (prev_edges)
-    ex = prev_edges->append_cond_position;
+  if (append_cond_position)
+    ex = append_cond_position;
 
   loop_has_only_one_exit = single_pred_p (ex->dest);
 
@@ -2033,10 +1995,10 @@ execute_sm_if_changed (edge ex, tree mem, tree tmp_var, tree flag,
 
   set_immediate_dominator (CDI_DOMINATORS, then_bb, new_bb);
 
-  if (prev_edges)
+  if (append_cond_position)
     {
-      basic_block prevbb = prev_edges->last_cond_fallthru->src;
-      redirect_edge_succ (prev_edges->last_cond_fallthru, new_bb);
+      basic_block prevbb = last_cond_fallthru->src;
+      redirect_edge_succ (last_cond_fallthru, new_bb);
       set_immediate_dominator (CDI_DOMINATORS, new_bb, prevbb);
       set_immediate_dominator (CDI_DOMINATORS, old_dest,
 			       recompute_dominator (CDI_DOMINATORS, old_dest));
@@ -2046,17 +2008,8 @@ execute_sm_if_changed (edge ex, tree mem, tree tmp_var, tree flag,
      sequence they originally happened.  Save the position right after
      the (_lsm) store we just created so we can continue appending after
      it and maintain the original order.  */
-  {
-    struct prev_flag_edges *p;
-
-    if (orig_ex->aux)
-      orig_ex->aux = NULL;
-    alloc_aux_for_edge (orig_ex, sizeof (struct prev_flag_edges));
-    p = (struct prev_flag_edges *) orig_ex->aux;
-    p->append_cond_position = then_old_edge;
-    p->last_cond_fallthru = find_edge (new_bb, old_dest);
-    orig_ex->aux = (void *) p;
-  }
+  append_cond_position = then_old_edge;
+  last_cond_fallthru = find_edge (new_bb, old_dest);
 
   if (!loop_has_only_one_exit)
     for (gphi_iterator gpi = gsi_start_phis (old_dest);
@@ -2222,7 +2175,8 @@ struct seq_entry
 
 static void
 execute_sm_exit (class loop *loop, edge ex, vec<seq_entry> &seq,
-		 hash_map<im_mem_ref *, sm_aux *> &aux_map, sm_kind kind)
+		 hash_map<im_mem_ref *, sm_aux *> &aux_map, sm_kind kind,
+		 edge &append_cond_position, edge &last_cond_fallthru)
 {
   /* Sink the stores to exit from the loop.  */
   for (unsigned i = seq.length (); i > 0; --i)
@@ -2255,7 +2209,8 @@ execute_sm_exit (class loop *loop, edge ex, vec<seq_entry> &seq,
 	else
 	  execute_sm_if_changed (ex, ref->mem.ref, aux->tmp_var,
 				 aux->store_flag,
-				 loop_preheader_edge (loop), &aux->flag_bbs);
+				 loop_preheader_edge (loop), &aux->flag_bbs,
+				 append_cond_position, last_cond_fallthru);
 	}
     }
 }
@@ -2647,14 +2602,21 @@ hoist_memory_references (class loop *loop, bitmap mem_refs,
   /* Materialize ordered store sequences on exits.  */
   FOR_EACH_VEC_ELT (exits, i, e)
     {
+      edge append_cond_position = NULL;
+      edge last_cond_fallthru = NULL;
       if (i < sms.length ())
 	{
 	  gcc_assert (sms[i].first == e);
-	  execute_sm_exit (loop, e, sms[i].second, aux_map, sm_ord);
+	  execute_sm_exit (loop, e, sms[i].second, aux_map, sm_ord,
+			   append_cond_position, last_cond_fallthru);
 	  sms[i].second.release ();
 	}
       if (!unord_refs.is_empty ())
-	execute_sm_exit (loop, e, unord_refs, aux_map, sm_unord);
+	execute_sm_exit (loop, e, unord_refs, aux_map, sm_unord,
+			 append_cond_position, last_cond_fallthru);
+      /* Commit edge inserts here to preserve the order of stores
+	 when an exit exits multiple loops.  */
+      gsi_commit_one_edge_insert (e, NULL);
     }
 
   for (hash_map<im_mem_ref *, sm_aux *>::iterator iter = aux_map.begin ();
@@ -2912,12 +2874,7 @@ store_motion_loop (class loop *loop, bitmap sm_executed)
     {
      find_refs_for_sm (loop, sm_executed, sm_in_loop);
      if (!bitmap_empty_p (sm_in_loop))
-	{
-	  hoist_memory_references (loop, sm_in_loop, exits);
-	  /* Commit edge inserts here to preserve the order of stores
-	     when an exit exits multiple loops.  */
-	  gsi_commit_edge_inserts ();
-	}
+	hoist_memory_references (loop, sm_in_loop, exits);
     }
 
   exits.release ();
@@ -3064,8 +3021,6 @@ tree_ssa_lim_initialize (void)
   if (flag_tm)
     compute_transaction_bits ();
 
-  alloc_aux_for_edges (0);
-
   memory_accesses.refs = new hash_table<mem_ref_hasher> (100);
   memory_accesses.refs_list.create (100);
   /* Allocate a special, unanalyzable mem-ref with ID zero.  */
@@ -3108,8 +3063,6 @@ tree_ssa_lim_finalize (void)
   unsigned i;
   im_mem_ref *ref;
 
-  free_aux_for_edges ();
-
   FOR_EACH_BB_FN (bb, cfun)
     SET_ALWAYS_EXECUTED_IN (bb, NULL);
@@ -3138,9 +3091,9 @@ tree_ssa_lim_finalize (void)
    i.e. those that are likely to be win regardless of the register pressure.  */
 
 static unsigned int
-tree_ssa_lim (void)
+tree_ssa_lim (function *fun)
 {
-  unsigned int todo;
+  unsigned int todo = 0;
 
   tree_ssa_lim_initialize ();
@@ -3150,17 +3103,27 @@ tree_ssa_lim (void)
   /* Fills ALWAYS_EXECUTED_IN information for basic blocks.  */
   fill_always_executed_in ();
 
+  int *rpo = XNEWVEC (int, last_basic_block_for_fn (fun));
+  int n = pre_and_rev_post_order_compute_fn (fun, NULL, rpo, false);
+
   /* For each statement determine the outermost loop in that it is
      invariant and cost for computing the invariant.  */
-  invariantness_dom_walker (CDI_DOMINATORS)
-    .walk (cfun->cfg->x_entry_block_ptr);
+  for (int i = 0; i < n; ++i)
+    compute_invariantness (BASIC_BLOCK_FOR_FN (fun, rpo[i]));
 
   /* Execute store motion.  Force the necessary invariants to be moved
      out of the loops as well.  */
   do_store_motion ();
 
   /* Move the expressions that are expensive enough.  */
-  todo = move_computations ();
+  for (int i = 0; i < n; ++i)
+    todo |= move_computations_worker (BASIC_BLOCK_FOR_FN (fun, rpo[i]));
+
+  free (rpo);
+
+  gsi_commit_edge_inserts ();
+  if (need_ssa_update_p (fun))
+    rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
 
   tree_ssa_lim_finalize ();
@@ -3207,7 +3170,7 @@ pass_lim::execute (function *fun)
   if (number_of_loops (fun) <= 1)
     return 0;
 
-  unsigned int todo = tree_ssa_lim ();
+  unsigned int todo = tree_ssa_lim (fun);
 
   if (!in_loop_pipeline)
     loop_optimizer_finalize ();