Message ID | 1288799546-29668-1-git-send-email-sebpop@gmail.com |
---|---|
State | New |
Headers | show |
On Wed, 3 Nov 2010, Sebastian Pop wrote: > 2010-10-20 Sebastian Pop <sebastian.pop@amd.com> > > PR tree-optimization/46029 > * doc/invoke.texi (-ftree-loop-if-convert-stores): Update description. > * tree-if-conv.c (has_unaligned_memory_refs): New. > (if_convertible_gimple_assign_stmt_p): Call has_unaligned_memory_refs. > (create_scratchpad): New. > (create_indirect_cond_expr): New. > (predicate_mem_writes): Call create_indirect_cond_expr. Take an extra > parameter for scratch_pad. > (combine_blocks): Same. > (tree_if_conversion): Same. > (main_tree_if_conversion): Pass to tree_if_conversion a pointer to > scratch_pad. > (struct ifc_dr): Removed. > (IFC_DR): Removed. > (DR_WRITTEN_AT_LEAST_ONCE): Removed. > (DR_RW_UNCONDITIONALLY): Removed. > (memrefs_read_or_written_unconditionally): Removed. > (write_memrefs_written_at_least_once): Removed. > (ifcvt_memrefs_wont_trap): Removed. > (ifcvt_could_trap_p): Does not take refs parameter anymore. > (if_convertible_gimple_assign_stmt_p): Same. > (if_convertible_stmt_p): Same. > (if_convertible_loop_p_1): Remove initialization of dr->aux, > DR_WRITTEN_AT_LEAST_ONCE, and DR_RW_UNCONDITIONALLY. > (if_convertible_loop_p): Remove deallocation of the same. Comments in-line > testsuite/ > * g++.dg/tree-ssa/ifc-pr46029.C: New. > * gcc.dg/tree-ssa/ifc-8.c: New. > * gcc.dg/tree-ssa/ifc-5.c: Make it exactly like the FFmpeg kernel. > --- > gcc/ChangeLog | 28 ++ > gcc/doc/invoke.texi | 18 +- > gcc/testsuite/ChangeLog | 7 + > gcc/testsuite/g++.dg/tree-ssa/ifc-pr46029.C | 76 ++++++ > gcc/testsuite/gcc.dg/tree-ssa/ifc-5.c | 17 +- > gcc/testsuite/gcc.dg/tree-ssa/ifc-8.c | 29 ++ > gcc/tree-if-conv.c | 379 ++++++++++++--------------- > 7 files changed, 336 insertions(+), 218 deletions(-) > create mode 100644 gcc/testsuite/g++.dg/tree-ssa/ifc-pr46029.C > create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ifc-8.c > > diff --git a/gcc/ChangeLog b/gcc/ChangeLog > index beed454..0f58882 100644 > --- a/gcc/ChangeLog > +++ b/gcc/ChangeLog > @@ -1,3 +1,31 @@ > +2010-10-20 Sebastian Pop <sebastian.pop@amd.com> > + > + PR tree-optimization/46029 > + * doc/invoke.texi (-ftree-loop-if-convert-stores): Update description. > + * tree-if-conv.c (has_unaligned_memory_refs): New. > + (if_convertible_gimple_assign_stmt_p): Call has_unaligned_memory_refs. > + (create_scratchpad): New. > + (create_indirect_cond_expr): New. > + (predicate_mem_writes): Call create_indirect_cond_expr. Take an extra > + parameter for scratch_pad. > + (combine_blocks): Same. > + (tree_if_conversion): Same. > + (main_tree_if_conversion): Pass to tree_if_conversion a pointer to > + scratch_pad. > + (struct ifc_dr): Removed. > + (IFC_DR): Removed. > + (DR_WRITTEN_AT_LEAST_ONCE): Removed. > + (DR_RW_UNCONDITIONALLY): Removed. > + (memrefs_read_or_written_unconditionally): Removed. > + (write_memrefs_written_at_least_once): Removed. > + (ifcvt_memrefs_wont_trap): Removed. > + (ifcvt_could_trap_p): Does not take refs parameter anymore. > + (if_convertible_gimple_assign_stmt_p): Same. > + (if_convertible_stmt_p): Same. > + (if_convertible_loop_p_1): Remove initialization of dr->aux, > + DR_WRITTEN_AT_LEAST_ONCE, and DR_RW_UNCONDITIONALLY. > + (if_convertible_loop_p): Remove deallocation of the same. > + > 2010-10-20 Nathan Froyd <froydnj@codesourcery.com> > > * ifcvt.c (noce_emit_cmove): If both of the values are SUBREGs, try > diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi > index ee68454..28b0cbb 100644 > --- a/gcc/doc/invoke.texi > +++ b/gcc/doc/invoke.texi > @@ -6935,20 +6935,26 @@ if vectorization is enabled. > > @item -ftree-loop-if-convert-stores > Attempt to also if-convert conditional jumps containing memory writes. > -This transformation can be unsafe for multi-threaded programs as it > -transforms conditional memory writes into unconditional memory writes. > For example, > @smallexample > for (i = 0; i < N; i++) > if (cond) > - A[i] = expr; > + A[i] = B[i] + 2; > @end smallexample > would be transformed to > @smallexample > -for (i = 0; i < N; i++) > - A[i] = cond ? expr : A[i]; > +void *scratchpad = alloca (64); > +for (i = 0; i < N; i++) @{ > + a = cond ? &A[i] : scratchpad; > + b = cond ? &B[i] : scratchpad; > + *a = *b + 2; > +@} > @end smallexample > -potentially producing data races. > +The compiler allocates a scratchpad memory on the stack for each > +function in which the if-conversion of memory stores or reads > +happened. This scratchpad memory is used during the part of the > +computation that is discarded, i.e., when the condition is evaluated > +to false. > > @item -ftree-loop-distribution > Perform loop distribution. This flag can improve cache performance on > diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog > index 9d9c543..4233f86 100644 > --- a/gcc/testsuite/ChangeLog > +++ b/gcc/testsuite/ChangeLog > @@ -1,3 +1,10 @@ > +2010-10-20 Sebastian Pop <sebastian.pop@amd.com> > + > + PR tree-optimization/46029 > + * g++.dg/tree-ssa/ifc-pr46029.C: New. > + * gcc.dg/tree-ssa/ifc-8.c: New. > + * gcc.dg/tree-ssa/ifc-5.c: Make it exactly like the FFmpeg kernel. > + > 2010-10-20 Rainer Orth <ro@CeBiTec.Uni-Bielefeld.DE> > > PR c++/46024 > diff --git a/gcc/testsuite/g++.dg/tree-ssa/ifc-pr46029.C b/gcc/testsuite/g++.dg/tree-ssa/ifc-pr46029.C > new file mode 100644 > index 0000000..2a54bdb > --- /dev/null > +++ b/gcc/testsuite/g++.dg/tree-ssa/ifc-pr46029.C > @@ -0,0 +1,76 @@ > +// { dg-do run } > +/* { dg-options "-O -ftree-loop-if-convert-stores" } */ > + > +namespace > +{ > + struct rb_tree_node_ > + { > + rb_tree_node_ ():m_p_left (0), m_p_parent (0), m_metadata (0) > + { > + } > + unsigned &get_metadata () > + { > + return m_metadata; > + } > + rb_tree_node_ *m_p_left; > + rb_tree_node_ *m_p_parent; > + unsigned m_metadata; > + }; > + > + struct bin_search_tree_const_node_it_ > + { > + bin_search_tree_const_node_it_ (rb_tree_node_ * p_nd):m_p_nd (p_nd) > + { > + } > + unsigned &get_metadata () > + { > + return m_p_nd->get_metadata (); > + } > + bin_search_tree_const_node_it_ get_l_child () > + { > + return bin_search_tree_const_node_it_ (m_p_nd->m_p_left); > + } > + > + rb_tree_node_ *m_p_nd; > + }; > + > + struct bin_search_tree_no_data_ > + { > + typedef rb_tree_node_ *node_pointer; > + bin_search_tree_no_data_ ():m_p_head (new rb_tree_node_ ()) > + { > + } > + void insert_imp_empty (int r_value) > + { > + rb_tree_node_ *p_new_node = new rb_tree_node_ (); > + m_p_head->m_p_parent = p_new_node; > + p_new_node->m_p_parent = m_p_head; > + update_to_top (m_p_head->m_p_parent); > + } > + void apply_update (bin_search_tree_const_node_it_ nd_it) > + { > + unsigned > + l_max_endpoint > + = > + (nd_it.get_l_child ().m_p_nd == > + 0) ? 0 : nd_it.get_l_child ().get_metadata (); > + nd_it.get_metadata () = l_max_endpoint; > + } > + void update_to_top (node_pointer p_nd) > + { > + while (p_nd != m_p_head) > + { > + apply_update (p_nd); > + p_nd = p_nd->m_p_parent; > + } > + } > + > + rb_tree_node_ * m_p_head; > + }; > +} > + > +int main () > +{ > + bin_search_tree_no_data_ ().insert_imp_empty (0); > + return 0; > +} > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ifc-5.c b/gcc/testsuite/gcc.dg/tree-ssa/ifc-5.c > index a9cc816..d88c4a2 100644 > --- a/gcc/testsuite/gcc.dg/tree-ssa/ifc-5.c > +++ b/gcc/testsuite/gcc.dg/tree-ssa/ifc-5.c > @@ -12,11 +12,18 @@ dct_unquantize_h263_inter_c (short *block, int n, int qscale, int nCoeffs) > for (i = 0; i <= nCoeffs; i++) > { > level = block[i]; > - if (level < 0) > - level = level * qmul - qadd; > - else > - level = level * qmul + qadd; > - block[i] = level; > + if (level) > + { > + if (level < 0) > + { > + level = level * qmul - qadd; > + } > + else > + { > + level = level * qmul + qadd; > + } > + block[i] = level; > + } > } > } > > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ifc-8.c b/gcc/testsuite/gcc.dg/tree-ssa/ifc-8.c > new file mode 100644 > index 0000000..d7cf279 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/tree-ssa/ifc-8.c > @@ -0,0 +1,29 @@ > +/* { dg-do compile } */ > +/* { dg-options "-c -O2 -ftree-vectorize" { target *-*-* } } */ > + > +typedef union tree_node *tree; > +struct tree_common > +{ > + unsigned volatile_flag : 1; > + unsigned unsigned_flag : 1; > +}; > +struct tree_type > +{ > + tree next_variant; > + tree main_variant; > +}; > +union tree_node > +{ > + struct tree_common common; > + struct tree_type type; > +}; > +void finish_enum (tree enumtype) > +{ > + tree tem; > + for (tem = ((enumtype)->type.main_variant); tem; tem = ((tem)->type.next_variant)) > + { > + if (tem == enumtype) > + continue; > + ((tem)->common.unsigned_flag) = ((enumtype)->common.unsigned_flag); > + } > +} > diff --git a/gcc/tree-if-conv.c b/gcc/tree-if-conv.c > index 642dbda..9fc6190 100644 > --- a/gcc/tree-if-conv.c > +++ b/gcc/tree-if-conv.c > @@ -446,171 +446,47 @@ if_convertible_phi_p (struct loop *loop, basic_block bb, gimple phi) > return true; > } > > -/* Records the status of a data reference. This struct is attached to > - each DR->aux field. */ > - > -struct ifc_dr { > - /* -1 when not initialized, 0 when false, 1 when true. */ > - int written_at_least_once; > - > - /* -1 when not initialized, 0 when false, 1 when true. */ > - int rw_unconditionally; > -}; > - > -#define IFC_DR(DR) ((struct ifc_dr *) (DR)->aux) > -#define DR_WRITTEN_AT_LEAST_ONCE(DR) (IFC_DR (DR)->written_at_least_once) > -#define DR_RW_UNCONDITIONALLY(DR) (IFC_DR (DR)->rw_unconditionally) > - > -/* Returns true when the memory references of STMT are read or written > - unconditionally. In other words, this function returns true when > - for every data reference A in STMT there exist other accesses to > - the same data reference with predicates that add up (OR-up) to the > - true predicate: this ensures that the data reference A is touched > - (read or written) on every iteration of the if-converted loop. */ > - > -static bool > -memrefs_read_or_written_unconditionally (gimple stmt, > - VEC (data_reference_p, heap) *drs) > -{ > - int i, j; > - data_reference_p a, b; > - tree ca = bb_predicate (gimple_bb (stmt)); > - > - for (i = 0; VEC_iterate (data_reference_p, drs, i, a); i++) > - if (DR_STMT (a) == stmt) > - { > - bool found = false; > - int x = DR_RW_UNCONDITIONALLY (a); > - > - if (x == 0) > - return false; > - > - if (x == 1) > - continue; > - > - for (j = 0; VEC_iterate (data_reference_p, drs, j, b); j++) > - if (DR_STMT (b) != stmt > - && same_data_refs (a, b)) > - { > - tree cb = bb_predicate (gimple_bb (DR_STMT (b))); > - > - if (DR_RW_UNCONDITIONALLY (b) == 1 > - || is_true_predicate (cb) > - || is_true_predicate (ca = fold_or_predicates (EXPR_LOCATION (cb), > - ca, cb))) > - { > - DR_RW_UNCONDITIONALLY (a) = 1; > - DR_RW_UNCONDITIONALLY (b) = 1; > - found = true; > - break; > - } > - } > - > - if (!found) > - { > - DR_RW_UNCONDITIONALLY (a) = 0; > - return false; > - } > - } > - > - return true; > -} > - > -/* Returns true when the memory references of STMT are unconditionally > - written. In other words, this function returns true when for every > - data reference A written in STMT, there exist other writes to the > - same data reference with predicates that add up (OR-up) to the true > - predicate: this ensures that the data reference A is written on > - every iteration of the if-converted loop. */ > +/* Wrapper around gimple_could_trap_p refined for the needs of the > + if-conversion. */ > > static bool > -write_memrefs_written_at_least_once (gimple stmt, > - VEC (data_reference_p, heap) *drs) > +ifcvt_could_trap_p (gimple stmt) > { > - int i, j; > - data_reference_p a, b; > - tree ca = bb_predicate (gimple_bb (stmt)); > - > - for (i = 0; VEC_iterate (data_reference_p, drs, i, a); i++) > - if (DR_STMT (a) == stmt > - && DR_IS_WRITE (a)) > - { > - bool found = false; > - int x = DR_WRITTEN_AT_LEAST_ONCE (a); > - > - if (x == 0) > - return false; > - > - if (x == 1) > - continue; > - > - for (j = 0; VEC_iterate (data_reference_p, drs, j, b); j++) > - if (DR_STMT (b) != stmt > - && DR_IS_WRITE (b) > - && same_data_refs_base_objects (a, b)) > - { > - tree cb = bb_predicate (gimple_bb (DR_STMT (b))); > - > - if (DR_WRITTEN_AT_LEAST_ONCE (b) == 1 > - || is_true_predicate (cb) > - || is_true_predicate (ca = fold_or_predicates (EXPR_LOCATION (cb), > - ca, cb))) > - { > - DR_WRITTEN_AT_LEAST_ONCE (a) = 1; > - DR_WRITTEN_AT_LEAST_ONCE (b) = 1; > - found = true; > - break; > - } > - } > - > - if (!found) > - { > - DR_WRITTEN_AT_LEAST_ONCE (a) = 0; > - return false; > - } > - } > + if (gimple_vuse (stmt) > + && !gimple_could_trap_p_1 (stmt, false, false)) > + return false; > > - return true; > + return gimple_could_trap_p (stmt); > } > > -/* Return true when the memory references of STMT won't trap in the > - if-converted code. There are two things that we have to check for: > - > - - writes to memory occur to writable memory: if-conversion of > - memory writes transforms the conditional memory writes into > - unconditional writes, i.e. "if (cond) A[i] = foo" is transformed > - into "A[i] = cond ? foo : A[i]", and as the write to memory may not > - be executed at all in the original code, it may be a readonly > - memory. To check that A is not const-qualified, we check that > - there exists at least an unconditional write to A in the current > - function. > - > - - reads or writes to memory are valid memory accesses for every > - iteration. To check that the memory accesses are correctly formed > - and that we are allowed to read and write in these locations, we > - check that the memory accesses to be if-converted occur at every > - iteration unconditionally. */ > +/* Returns true when stmt contains a data reference. */ > > static bool > -ifcvt_memrefs_wont_trap (gimple stmt, VEC (data_reference_p, heap) *refs) > +has_unaligned_memory_refs (gimple stmt) Ick - unified diffs are hard to read (sometimes). The comment doesn't match the function name -- unaligned data reference or not? > { > - return write_memrefs_written_at_least_once (stmt, refs) > - && memrefs_read_or_written_unconditionally (stmt, refs); > -} > - > -/* Wrapper around gimple_could_trap_p refined for the needs of the > - if-conversion. Try to prove that the memory accesses of STMT could > - not trap in the innermost loop containing STMT. */ > + int unsignedp, volatilep; > + HOST_WIDE_INT bitsize, bitpos; > + tree toffset; > + enum machine_mode mode; > + VEC (data_ref_loc, heap) *refs = VEC_alloc (data_ref_loc, heap, 3); > + bool res = get_references_in_stmt (stmt, &refs); > + unsigned i; > + data_ref_loc *ref; > + > + FOR_EACH_VEC_ELT (data_ref_loc, refs, i, ref) > + { > + get_inner_reference (*ref->pos, &bitsize, &bitpos, &toffset, > + &mode, &unsignedp, &volatilep, true); > > -static bool > -ifcvt_could_trap_p (gimple stmt, VEC (data_reference_p, heap) *refs) > -{ > - if (gimple_vuse (stmt) > - && !gimple_could_trap_p_1 (stmt, false, false) > - && ifcvt_memrefs_wont_trap (stmt, refs)) > - return false; > + if ((bitpos % BITS_PER_UNIT) != 0) Hmm, that's not actually unaligned but not addressable, right? I guess you want to re-use ivopts may_be_nonaddressable_p instead. > + { > + res = true; > + break; > + } > + } > > - return gimple_could_trap_p (stmt); > + VEC_free (data_ref_loc, heap, refs); > + return res; > } > > /* Return true when STMT is if-convertible. > @@ -621,8 +497,7 @@ ifcvt_could_trap_p (gimple stmt, VEC (data_reference_p, heap) *refs) > - LHS is not var decl. */ > > static bool > -if_convertible_gimple_assign_stmt_p (gimple stmt, > - VEC (data_reference_p, heap) *refs) > +if_convertible_gimple_assign_stmt_p (gimple stmt) > { > tree lhs = gimple_assign_lhs (stmt); > basic_block bb; > @@ -650,12 +525,20 @@ if_convertible_gimple_assign_stmt_p (gimple stmt, > > if (flag_tree_loop_if_convert_stores) > { > - if (ifcvt_could_trap_p (stmt, refs)) > + if (ifcvt_could_trap_p (stmt)) > { > if (dump_file && (dump_flags & TDF_DETAILS)) > fprintf (dump_file, "tree could trap...\n"); > return false; > } > + > + if (has_unaligned_memory_refs (stmt)) > + { > + if (dump_file && (dump_flags & TDF_DETAILS)) > + fprintf (dump_file, "uses misaligned memory...\n"); But here it suggests misaligned again (why'd we care for misalignment?) > + return false; > + } > + > return true; > } > > @@ -690,7 +573,7 @@ if_convertible_gimple_assign_stmt_p (gimple stmt, > - it is a GIMPLE_LABEL or a GIMPLE_COND. */ > > static bool > -if_convertible_stmt_p (gimple stmt, VEC (data_reference_p, heap) *refs) > +if_convertible_stmt_p (gimple stmt) > { > switch (gimple_code (stmt)) > { > @@ -700,7 +583,7 @@ if_convertible_stmt_p (gimple stmt, VEC (data_reference_p, heap) *refs) > return true; > > case GIMPLE_ASSIGN: > - return if_convertible_gimple_assign_stmt_p (stmt, refs); > + return if_convertible_gimple_assign_stmt_p (stmt); > > default: > /* Don't know what to do with 'em so don't do anything. */ > @@ -1016,18 +899,6 @@ if_convertible_loop_p_1 (struct loop *loop, > if (!res) > return false; > > - if (flag_tree_loop_if_convert_stores) > - { > - data_reference_p dr; > - > - for (i = 0; VEC_iterate (data_reference_p, *refs, i, dr); i++) > - { > - dr->aux = XNEW (struct ifc_dr); > - DR_WRITTEN_AT_LEAST_ONCE (dr) = -1; > - DR_RW_UNCONDITIONALLY (dr) = -1; > - } > - } > - > for (i = 0; i < loop->num_nodes; i++) > { > basic_block bb = ifc_bbs[i]; > @@ -1040,7 +911,7 @@ if_convertible_loop_p_1 (struct loop *loop, > /* Check the if-convertibility of statements in predicated BBs. */ > if (is_predicated (bb)) > for (itr = gsi_start_bb (bb); !gsi_end_p (itr); gsi_next (&itr)) > - if (!if_convertible_stmt_p (gsi_stmt (itr), *refs)) > + if (!if_convertible_stmt_p (gsi_stmt (itr))) > return false; > } > > @@ -1101,15 +972,6 @@ if_convertible_loop_p (struct loop *loop) > ddrs = VEC_alloc (ddr_p, heap, 25); > res = if_convertible_loop_p_1 (loop, &refs, &ddrs); > > - if (flag_tree_loop_if_convert_stores) > - { > - data_reference_p dr; > - unsigned int i; > - > - for (i = 0; VEC_iterate (data_reference_p, refs, i, dr); i++) > - free (dr->aux); > - } > - > free_data_refs (refs); > free_dependence_relations (ddrs); > return res; > @@ -1366,6 +1228,78 @@ insert_gimplified_predicates (loop_p loop) > } > } > > +/* Insert at the beginning of the first basic block of the current > + function the allocation on the stack of N bytes of memory and > + return a pointer to this scratchpad memory. */ > + > +static tree > +create_scratchpad (void) > +{ > + basic_block bb = single_succ (ENTRY_BLOCK_PTR); > + gimple_stmt_iterator gsi = gsi_after_labels (bb); > + > + /* void *tmp = __builtin_alloca */ > + const char *name = "scratch_pad"; > + tree x = build_int_cst (integer_type_node, 64); > + gimple stmt = gimple_build_call (built_in_decls[BUILT_IN_ALLOCA], 1, x); > + tree var = create_tmp_var (ptr_type_node, name); > + tree tmp = make_ssa_name (var, stmt); It would be better to use an automatic variable than using alloca which is expensive. Why was your choice that way? (Are we ever if-converting aggregate stores? I hope not.) Also you are unconditionally allocating 64 bytes instead of N. Note that if you want to make vectorization happy you would need to ensure that for if (x) a[i] = ...; the scratchpad you'll end up using will have the _same_ alignment as a[i] (same or larger for all offsets). Using a local array of chars should make it possible for the vectorizer to adjust its alignment if needed. > + add_referenced_var (var); > + gimple_call_set_lhs (stmt, tmp); > + SSA_NAME_DEF_STMT (tmp) = stmt; > + update_stmt (stmt); > + > + gsi_insert_before (&gsi, stmt, GSI_SAME_STMT); > + return tmp; > +} > + > +/* Returns a memory reference to the pointer defined by the > + conditional expression: pointer = cond ? &A[i] : scratch_pad; and > + inserts this code at GSI. */ > + > +static tree > +create_indirect_cond_expr (tree ai, tree cond, tree *scratch_pad, > + gimple_stmt_iterator *gsi) > +{ > + tree type = TREE_TYPE (ai); > + > + tree pointer_to_type, address_of_ai, addr_expr, cond_expr; > + tree pointer, star_pointer; > + gimple addr_stmt, pointer_stmt; > + > + /* address_of_ai = &A[i]; */ > + pointer_to_type = build_pointer_type (type); > + address_of_ai = create_tmp_var (pointer_to_type, "_ifc_"); Use create_tmp_reg (everywhere) > + add_referenced_var (address_of_ai); > + addr_expr = build_fold_addr_expr (ai); If you build that before create_tmp_reg you can use TREE_TYPE of it and avoid creating pointer_to_type. > + addr_stmt = gimple_build_assign (address_of_ai, addr_expr); > + address_of_ai = make_ssa_name (address_of_ai, addr_stmt); > + gimple_assign_set_lhs (addr_stmt, address_of_ai); > + SSA_NAME_DEF_STMT (address_of_ai) = addr_stmt; > + update_stmt (addr_stmt); > + gsi_insert_before (gsi, addr_stmt, GSI_SAME_STMT); > + > + /* Allocate the scratch pad only once per function. */ > + if (!*scratch_pad) > + *scratch_pad = create_scratchpad (); > + > + /* pointer = cond ? address_of_ai : scratch_pad; */ > + pointer = create_tmp_var (pointer_to_type, "_ifc_"); > + add_referenced_var (pointer); > + cond_expr = build3 (COND_EXPR, pointer_to_type, unshare_expr (cond), > + address_of_ai, *scratch_pad); > + pointer_stmt = gimple_build_assign (pointer, cond_expr); > + pointer = make_ssa_name (pointer, pointer_stmt); > + gimple_assign_set_lhs (pointer_stmt, pointer); > + SSA_NAME_DEF_STMT (pointer) = pointer_stmt; > + update_stmt (pointer_stmt); > + gsi_insert_before (gsi, pointer_stmt, GSI_SAME_STMT); > + > + star_pointer = build_simple_mem_ref (pointer); build2 (MEM_REF, TREE_TYPE (ai), pointer, build_int_cst (reference_alias_ptr_type (ai), 0)); as you need to preserve TBAA info. > + return star_pointer; > +} > + > /* Predicate each write to memory in LOOP. > > This function transforms control flow constructs containing memory > @@ -1377,10 +1311,19 @@ insert_gimplified_predicates (loop_p loop) > > into the following form that does not contain control flow: > > - | for (i = 0; i < N; i++) > - | A[i] = cond ? expr : A[i]; > + | void *scratch_pad = alloca (MAX_TYPE_SIZE_IN_BYTES); > + | > + | for (i = 0; i < N; i++) { > + | p = cond ? &A[i] : scratch_pad; > + | *p = expr; > + | } > + > + SCRATCH_PAD is allocated on the stack for each function once and it is > + large enough to contain any kind of scalar assignment or read. All > + values read or written to SCRATCH_PAD are not used in the computation. > > - The original CFG looks like this: > + In a more detailed way, the if-conversion of memory writes works > + like this, supposing that the original CFG looks like this: > > | bb_0 > | i = 0 > @@ -1430,10 +1373,12 @@ insert_gimplified_predicates (loop_p loop) > | goto bb_1 > | end_bb_4 > > - predicate_mem_writes is then predicating the memory write as follows: > + predicate_mem_writes is then allocating SCRATCH_PAD in the basic block > + preceding the loop header, and is predicating the memory write: > > | bb_0 > | i = 0 > + | scratch_pad = alloca (MAX_TYPE_SIZE_IN_BYTES); > | end_bb_0 > | > | bb_1 > @@ -1441,12 +1386,14 @@ insert_gimplified_predicates (loop_p loop) > | end_bb_1 > | > | bb_2 > + | cond = some_computation; > | if (cond) goto bb_3 else goto bb_4 > | end_bb_2 > | > | bb_3 > | cond = some_computation; > - | A[i] = cond ? expr : A[i]; > + | p = cond ? &A[i] : scratch_pad; > + | *p = expr; > | goto bb_4 > | end_bb_3 > | > @@ -1459,12 +1406,14 @@ insert_gimplified_predicates (loop_p loop) > > | bb_0 > | i = 0 > + | scratch_pad = alloca (MAX_TYPE_SIZE_IN_BYTES); > | if (i < N) goto bb_5 else goto bb_1 > | end_bb_0 > | > | bb_1 > | cond = some_computation; > - | A[i] = cond ? expr : A[i]; > + | p = cond ? &A[i] : scratch_pad; > + | *p = expr; > | if (i < N) goto bb_5 else goto bb_4 > | end_bb_1 > | > @@ -1474,7 +1423,7 @@ insert_gimplified_predicates (loop_p loop) > */ > > static void > -predicate_mem_writes (loop_p loop) > +predicate_mem_writes (loop_p loop, tree *scratch_pad) > { > unsigned int i, orig_loop_num_nodes = loop->num_nodes; > > @@ -1489,20 +1438,35 @@ predicate_mem_writes (loop_p loop) > continue; > > for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) > - if ((stmt = gsi_stmt (gsi)) > - && gimple_assign_single_p (stmt) > - && gimple_vdef (stmt)) > - { > - tree lhs = gimple_assign_lhs (stmt); > - tree rhs = gimple_assign_rhs1 (stmt); > - tree type = TREE_TYPE (lhs); > - > - lhs = ifc_temp_var (type, unshare_expr (lhs), &gsi); > - rhs = ifc_temp_var (type, unshare_expr (rhs), &gsi); > - rhs = build3 (COND_EXPR, type, unshare_expr (cond), rhs, lhs); > - gimple_assign_set_rhs1 (stmt, ifc_temp_var (type, rhs, &gsi)); > - update_stmt (stmt); > - } > + { > + stmt = gsi_stmt (gsi); > + if (gimple_assign_single_p (stmt) > + && gimple_vdef (stmt)) > + { > + /* A[i] = x; */ > + tree ai = gimple_assign_lhs (stmt); > + > + /* pointer = cond ? &A[i] : scratch_pad; */ > + tree star_pointer = create_indirect_cond_expr (ai, cond, > + scratch_pad, &gsi); > + /* *pointer = x; */ > + gimple_assign_set_lhs (stmt, star_pointer); > + update_stmt (stmt); > + } > + else if (gimple_assign_single_p (stmt) > + && gimple_vuse (stmt)) > + { > + /* x = A[i]; */ > + tree ai = gimple_assign_rhs1 (stmt); > + > + /* pointer = cond ? &A[i] : scratch_pad; */ > + tree star_pointer = create_indirect_cond_expr (ai, cond, > + scratch_pad, &gsi); > + /* x = *pointer; */ > + gimple_assign_set_rhs1 (stmt, star_pointer); > + update_stmt (stmt); > + } > + } > } > } > > @@ -1552,7 +1516,7 @@ remove_conditions_and_labels (loop_p loop) > blocks. Replace PHI nodes with conditional modify expressions. */ > > static void > -combine_blocks (struct loop *loop) > +combine_blocks (struct loop *loop, tree *scratch_pad) > { > basic_block bb, exit_bb, merge_target_bb; > unsigned int orig_loop_num_nodes = loop->num_nodes; > @@ -1565,7 +1529,7 @@ combine_blocks (struct loop *loop) > predicate_all_scalar_phis (loop); > > if (flag_tree_loop_if_convert_stores) > - predicate_mem_writes (loop); > + predicate_mem_writes (loop, scratch_pad); > > /* Merge basic blocks: first remove all the edges in the loop, > except for those from the exit block. */ > @@ -1654,7 +1618,7 @@ combine_blocks (struct loop *loop) > profitability analysis. Returns true when something changed. */ > > static bool > -tree_if_conversion (struct loop *loop) > +tree_if_conversion (struct loop *loop, tree *scratch_pad) > { > bool changed = false; > ifc_bbs = NULL; > @@ -1666,7 +1630,7 @@ tree_if_conversion (struct loop *loop) > /* Now all statements are if-convertible. Combine all the basic > blocks into one huge basic block doing the if-conversion > on-the-fly. */ > - combine_blocks (loop); > + combine_blocks (loop, scratch_pad); > > if (flag_tree_loop_if_convert_stores) > mark_sym_for_renaming (gimple_vop (cfun)); > @@ -1697,12 +1661,13 @@ main_tree_if_conversion (void) > struct loop *loop; > bool changed = false; > unsigned todo = 0; > + tree scratch_pad = NULL_TREE; > > if (number_of_loops () <= 1) > return 0; > > FOR_EACH_LOOP (li, loop, 0) > - changed |= tree_if_conversion (loop); > + changed |= tree_if_conversion (loop, &scratch_pad); > > if (changed) > todo |= TODO_cleanup_cfg; Overall I like the new way much more. Please update and repost. Thanks, Richard.
On Fri, Nov 5, 2010 at 07:06, Richard Guenther <rguenther@suse.de> wrote: >> +/* Returns true when stmt contains a data reference. */ >> >> static bool >> -ifcvt_memrefs_wont_trap (gimple stmt, VEC (data_reference_p, heap) *refs) >> +has_unaligned_memory_refs (gimple stmt) > > Ick - unified diffs are hard to read (sometimes). The comment > doesn't match the function name -- unaligned data reference or not? > Right, the comment is incomplete. >> + if ((bitpos % BITS_PER_UNIT) != 0) > > Hmm, that's not actually unaligned but not addressable, right? > I guess you want to re-use ivopts may_be_nonaddressable_p instead. Correct. One of the testcases that I added triggered this error in expand, and I took part of this code from may_be_nonaddressable_p. I don't remember why I was not able to use may_be_nonaddressable_p, but I will try. >> + if (has_unaligned_memory_refs (stmt)) >> + { >> + if (dump_file && (dump_flags & TDF_DETAILS)) >> + fprintf (dump_file, "uses misaligned memory...\n"); > > But here it suggests misaligned again (why'd we care for misalignment?) > I will change this printf to nonaddressable memory. >> +/* Insert at the beginning of the first basic block of the current >> + function the allocation on the stack of N bytes of memory and >> + return a pointer to this scratchpad memory. */ >> + >> +static tree >> +create_scratchpad (void) >> +{ >> + basic_block bb = single_succ (ENTRY_BLOCK_PTR); >> + gimple_stmt_iterator gsi = gsi_after_labels (bb); >> + >> + /* void *tmp = __builtin_alloca */ >> + const char *name = "scratch_pad"; >> + tree x = build_int_cst (integer_type_node, 64); >> + gimple stmt = gimple_build_call (built_in_decls[BUILT_IN_ALLOCA], 1, x); >> + tree var = create_tmp_var (ptr_type_node, name); >> + tree tmp = make_ssa_name (var, stmt); > > It would be better to use an automatic variable than using alloca > which is expensive. Why was your choice that way? (Are we ever > if-converting aggregate stores? I hope not.) > > Also you are unconditionally allocating 64 bytes instead of N. > > Note that if you want to make vectorization happy you would need > to ensure that for > > if (x) > a[i] = ...; > > the scratchpad you'll end up using will have the _same_ alignment > as a[i] (same or larger for all offsets). Using a local array > of chars should make it possible for the vectorizer to adjust > its alignment if needed. > Ok, thanks for the recommendation, I will try to use a local array. Also, for the vectorization to happen, we should teach the vectorizer and the data dependence analysis that the reads/writes to the scratchpad have inconsequential effects. The vectorizer should also be able to use extra scratchpad memory to transform the scratchpaded dataref into a vector in the scratchpad. > > Overall I like the new way much more. Please update and repost. Thanks for the review. I will send an updated patch. Sebastian
Hi Richi, On Fri, Nov 5, 2010 at 11:08, Sebastian Pop <sebpop@gmail.com> wrote: >> Note that if you want to make vectorization happy you would need >> to ensure that for >> >> if (x) >> a[i] = ...; >> >> the scratchpad you'll end up using will have the _same_ alignment >> as a[i] (same or larger for all offsets). Using a local array >> of chars should make it possible for the vectorizer to adjust >> its alignment if needed. >> > > Ok, thanks for the recommendation, I will try to use a local array. Do you happen to know how to declare an automatic variable? I was looking at the code in tree-switch-conversion.c:build_one_array. Does that look the right approach to statically declare the scratchpad on the stack? Thanks, Sebastian
On Wed, 10 Nov 2010, Sebastian Pop wrote: > Hi Richi, > > On Fri, Nov 5, 2010 at 11:08, Sebastian Pop <sebpop@gmail.com> wrote: > >> Note that if you want to make vectorization happy you would need > >> to ensure that for > >> > >> if (x) > >> a[i] = ...; > >> > >> the scratchpad you'll end up using will have the _same_ alignment > >> as a[i] (same or larger for all offsets). Using a local array > >> of chars should make it possible for the vectorizer to adjust > >> its alignment if needed. > >> > > > > Ok, thanks for the recommendation, I will try to use a local array. > > Do you happen to know how to declare an automatic variable? > > I was looking at the code in tree-switch-conversion.c:build_one_array. > Does that look the right approach to statically declare the scratchpad > on the stack? As simple as var = create_tmp_var (type, NULL_TREE); switch-conversion builds a local static array, not an automatic var. Richard.
diff --git a/gcc/ChangeLog b/gcc/ChangeLog index beed454..0f58882 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,31 @@ +2010-10-20 Sebastian Pop <sebastian.pop@amd.com> + + PR tree-optimization/46029 + * doc/invoke.texi (-ftree-loop-if-convert-stores): Update description. + * tree-if-conv.c (has_unaligned_memory_refs): New. + (if_convertible_gimple_assign_stmt_p): Call has_unaligned_memory_refs. + (create_scratchpad): New. + (create_indirect_cond_expr): New. + (predicate_mem_writes): Call create_indirect_cond_expr. Take an extra + parameter for scratch_pad. + (combine_blocks): Same. + (tree_if_conversion): Same. + (main_tree_if_conversion): Pass to tree_if_conversion a pointer to + scratch_pad. + (struct ifc_dr): Removed. + (IFC_DR): Removed. + (DR_WRITTEN_AT_LEAST_ONCE): Removed. + (DR_RW_UNCONDITIONALLY): Removed. + (memrefs_read_or_written_unconditionally): Removed. + (write_memrefs_written_at_least_once): Removed. + (ifcvt_memrefs_wont_trap): Removed. + (ifcvt_could_trap_p): Does not take refs parameter anymore. + (if_convertible_gimple_assign_stmt_p): Same. + (if_convertible_stmt_p): Same. + (if_convertible_loop_p_1): Remove initialization of dr->aux, + DR_WRITTEN_AT_LEAST_ONCE, and DR_RW_UNCONDITIONALLY. + (if_convertible_loop_p): Remove deallocation of the same. + 2010-10-20 Nathan Froyd <froydnj@codesourcery.com> * ifcvt.c (noce_emit_cmove): If both of the values are SUBREGs, try diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index ee68454..28b0cbb 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -6935,20 +6935,26 @@ if vectorization is enabled. @item -ftree-loop-if-convert-stores Attempt to also if-convert conditional jumps containing memory writes. -This transformation can be unsafe for multi-threaded programs as it -transforms conditional memory writes into unconditional memory writes. For example, @smallexample for (i = 0; i < N; i++) if (cond) - A[i] = expr; + A[i] = B[i] + 2; @end smallexample would be transformed to @smallexample -for (i = 0; i < N; i++) - A[i] = cond ? expr : A[i]; +void *scratchpad = alloca (64); +for (i = 0; i < N; i++) @{ + a = cond ? &A[i] : scratchpad; + b = cond ? &B[i] : scratchpad; + *a = *b + 2; +@} @end smallexample -potentially producing data races. +The compiler allocates a scratchpad memory on the stack for each +function in which the if-conversion of memory stores or reads +happened. This scratchpad memory is used during the part of the +computation that is discarded, i.e., when the condition is evaluated +to false. @item -ftree-loop-distribution Perform loop distribution. This flag can improve cache performance on diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 9d9c543..4233f86 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,10 @@ +2010-10-20 Sebastian Pop <sebastian.pop@amd.com> + + PR tree-optimization/46029 + * g++.dg/tree-ssa/ifc-pr46029.C: New. + * gcc.dg/tree-ssa/ifc-8.c: New. + * gcc.dg/tree-ssa/ifc-5.c: Make it exactly like the FFmpeg kernel. + 2010-10-20 Rainer Orth <ro@CeBiTec.Uni-Bielefeld.DE> PR c++/46024 diff --git a/gcc/testsuite/g++.dg/tree-ssa/ifc-pr46029.C b/gcc/testsuite/g++.dg/tree-ssa/ifc-pr46029.C new file mode 100644 index 0000000..2a54bdb --- /dev/null +++ b/gcc/testsuite/g++.dg/tree-ssa/ifc-pr46029.C @@ -0,0 +1,76 @@ +// { dg-do run } +/* { dg-options "-O -ftree-loop-if-convert-stores" } */ + +namespace +{ + struct rb_tree_node_ + { + rb_tree_node_ ():m_p_left (0), m_p_parent (0), m_metadata (0) + { + } + unsigned &get_metadata () + { + return m_metadata; + } + rb_tree_node_ *m_p_left; + rb_tree_node_ *m_p_parent; + unsigned m_metadata; + }; + + struct bin_search_tree_const_node_it_ + { + bin_search_tree_const_node_it_ (rb_tree_node_ * p_nd):m_p_nd (p_nd) + { + } + unsigned &get_metadata () + { + return m_p_nd->get_metadata (); + } + bin_search_tree_const_node_it_ get_l_child () + { + return bin_search_tree_const_node_it_ (m_p_nd->m_p_left); + } + + rb_tree_node_ *m_p_nd; + }; + + struct bin_search_tree_no_data_ + { + typedef rb_tree_node_ *node_pointer; + bin_search_tree_no_data_ ():m_p_head (new rb_tree_node_ ()) + { + } + void insert_imp_empty (int r_value) + { + rb_tree_node_ *p_new_node = new rb_tree_node_ (); + m_p_head->m_p_parent = p_new_node; + p_new_node->m_p_parent = m_p_head; + update_to_top (m_p_head->m_p_parent); + } + void apply_update (bin_search_tree_const_node_it_ nd_it) + { + unsigned + l_max_endpoint + = + (nd_it.get_l_child ().m_p_nd == + 0) ? 0 : nd_it.get_l_child ().get_metadata (); + nd_it.get_metadata () = l_max_endpoint; + } + void update_to_top (node_pointer p_nd) + { + while (p_nd != m_p_head) + { + apply_update (p_nd); + p_nd = p_nd->m_p_parent; + } + } + + rb_tree_node_ * m_p_head; + }; +} + +int main () +{ + bin_search_tree_no_data_ ().insert_imp_empty (0); + return 0; +} diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ifc-5.c b/gcc/testsuite/gcc.dg/tree-ssa/ifc-5.c index a9cc816..d88c4a2 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/ifc-5.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/ifc-5.c @@ -12,11 +12,18 @@ dct_unquantize_h263_inter_c (short *block, int n, int qscale, int nCoeffs) for (i = 0; i <= nCoeffs; i++) { level = block[i]; - if (level < 0) - level = level * qmul - qadd; - else - level = level * qmul + qadd; - block[i] = level; + if (level) + { + if (level < 0) + { + level = level * qmul - qadd; + } + else + { + level = level * qmul + qadd; + } + block[i] = level; + } } } diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ifc-8.c b/gcc/testsuite/gcc.dg/tree-ssa/ifc-8.c new file mode 100644 index 0000000..d7cf279 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ifc-8.c @@ -0,0 +1,29 @@ +/* { dg-do compile } */ +/* { dg-options "-c -O2 -ftree-vectorize" { target *-*-* } } */ + +typedef union tree_node *tree; +struct tree_common +{ + unsigned volatile_flag : 1; + unsigned unsigned_flag : 1; +}; +struct tree_type +{ + tree next_variant; + tree main_variant; +}; +union tree_node +{ + struct tree_common common; + struct tree_type type; +}; +void finish_enum (tree enumtype) +{ + tree tem; + for (tem = ((enumtype)->type.main_variant); tem; tem = ((tem)->type.next_variant)) + { + if (tem == enumtype) + continue; + ((tem)->common.unsigned_flag) = ((enumtype)->common.unsigned_flag); + } +} diff --git a/gcc/tree-if-conv.c b/gcc/tree-if-conv.c index 642dbda..9fc6190 100644 --- a/gcc/tree-if-conv.c +++ b/gcc/tree-if-conv.c @@ -446,171 +446,47 @@ if_convertible_phi_p (struct loop *loop, basic_block bb, gimple phi) return true; } -/* Records the status of a data reference. This struct is attached to - each DR->aux field. */ - -struct ifc_dr { - /* -1 when not initialized, 0 when false, 1 when true. */ - int written_at_least_once; - - /* -1 when not initialized, 0 when false, 1 when true. */ - int rw_unconditionally; -}; - -#define IFC_DR(DR) ((struct ifc_dr *) (DR)->aux) -#define DR_WRITTEN_AT_LEAST_ONCE(DR) (IFC_DR (DR)->written_at_least_once) -#define DR_RW_UNCONDITIONALLY(DR) (IFC_DR (DR)->rw_unconditionally) - -/* Returns true when the memory references of STMT are read or written - unconditionally. In other words, this function returns true when - for every data reference A in STMT there exist other accesses to - the same data reference with predicates that add up (OR-up) to the - true predicate: this ensures that the data reference A is touched - (read or written) on every iteration of the if-converted loop. */ - -static bool -memrefs_read_or_written_unconditionally (gimple stmt, - VEC (data_reference_p, heap) *drs) -{ - int i, j; - data_reference_p a, b; - tree ca = bb_predicate (gimple_bb (stmt)); - - for (i = 0; VEC_iterate (data_reference_p, drs, i, a); i++) - if (DR_STMT (a) == stmt) - { - bool found = false; - int x = DR_RW_UNCONDITIONALLY (a); - - if (x == 0) - return false; - - if (x == 1) - continue; - - for (j = 0; VEC_iterate (data_reference_p, drs, j, b); j++) - if (DR_STMT (b) != stmt - && same_data_refs (a, b)) - { - tree cb = bb_predicate (gimple_bb (DR_STMT (b))); - - if (DR_RW_UNCONDITIONALLY (b) == 1 - || is_true_predicate (cb) - || is_true_predicate (ca = fold_or_predicates (EXPR_LOCATION (cb), - ca, cb))) - { - DR_RW_UNCONDITIONALLY (a) = 1; - DR_RW_UNCONDITIONALLY (b) = 1; - found = true; - break; - } - } - - if (!found) - { - DR_RW_UNCONDITIONALLY (a) = 0; - return false; - } - } - - return true; -} - -/* Returns true when the memory references of STMT are unconditionally - written. In other words, this function returns true when for every - data reference A written in STMT, there exist other writes to the - same data reference with predicates that add up (OR-up) to the true - predicate: this ensures that the data reference A is written on - every iteration of the if-converted loop. */ +/* Wrapper around gimple_could_trap_p refined for the needs of the + if-conversion. */ static bool -write_memrefs_written_at_least_once (gimple stmt, - VEC (data_reference_p, heap) *drs) +ifcvt_could_trap_p (gimple stmt) { - int i, j; - data_reference_p a, b; - tree ca = bb_predicate (gimple_bb (stmt)); - - for (i = 0; VEC_iterate (data_reference_p, drs, i, a); i++) - if (DR_STMT (a) == stmt - && DR_IS_WRITE (a)) - { - bool found = false; - int x = DR_WRITTEN_AT_LEAST_ONCE (a); - - if (x == 0) - return false; - - if (x == 1) - continue; - - for (j = 0; VEC_iterate (data_reference_p, drs, j, b); j++) - if (DR_STMT (b) != stmt - && DR_IS_WRITE (b) - && same_data_refs_base_objects (a, b)) - { - tree cb = bb_predicate (gimple_bb (DR_STMT (b))); - - if (DR_WRITTEN_AT_LEAST_ONCE (b) == 1 - || is_true_predicate (cb) - || is_true_predicate (ca = fold_or_predicates (EXPR_LOCATION (cb), - ca, cb))) - { - DR_WRITTEN_AT_LEAST_ONCE (a) = 1; - DR_WRITTEN_AT_LEAST_ONCE (b) = 1; - found = true; - break; - } - } - - if (!found) - { - DR_WRITTEN_AT_LEAST_ONCE (a) = 0; - return false; - } - } + if (gimple_vuse (stmt) + && !gimple_could_trap_p_1 (stmt, false, false)) + return false; - return true; + return gimple_could_trap_p (stmt); } -/* Return true when the memory references of STMT won't trap in the - if-converted code. There are two things that we have to check for: - - - writes to memory occur to writable memory: if-conversion of - memory writes transforms the conditional memory writes into - unconditional writes, i.e. "if (cond) A[i] = foo" is transformed - into "A[i] = cond ? foo : A[i]", and as the write to memory may not - be executed at all in the original code, it may be a readonly - memory. To check that A is not const-qualified, we check that - there exists at least an unconditional write to A in the current - function. - - - reads or writes to memory are valid memory accesses for every - iteration. To check that the memory accesses are correctly formed - and that we are allowed to read and write in these locations, we - check that the memory accesses to be if-converted occur at every - iteration unconditionally. */ +/* Returns true when stmt contains a data reference. */ static bool -ifcvt_memrefs_wont_trap (gimple stmt, VEC (data_reference_p, heap) *refs) +has_unaligned_memory_refs (gimple stmt) { - return write_memrefs_written_at_least_once (stmt, refs) - && memrefs_read_or_written_unconditionally (stmt, refs); -} - -/* Wrapper around gimple_could_trap_p refined for the needs of the - if-conversion. Try to prove that the memory accesses of STMT could - not trap in the innermost loop containing STMT. */ + int unsignedp, volatilep; + HOST_WIDE_INT bitsize, bitpos; + tree toffset; + enum machine_mode mode; + VEC (data_ref_loc, heap) *refs = VEC_alloc (data_ref_loc, heap, 3); + bool res = get_references_in_stmt (stmt, &refs); + unsigned i; + data_ref_loc *ref; + + FOR_EACH_VEC_ELT (data_ref_loc, refs, i, ref) + { + get_inner_reference (*ref->pos, &bitsize, &bitpos, &toffset, + &mode, &unsignedp, &volatilep, true); -static bool -ifcvt_could_trap_p (gimple stmt, VEC (data_reference_p, heap) *refs) -{ - if (gimple_vuse (stmt) - && !gimple_could_trap_p_1 (stmt, false, false) - && ifcvt_memrefs_wont_trap (stmt, refs)) - return false; + if ((bitpos % BITS_PER_UNIT) != 0) + { + res = true; + break; + } + } - return gimple_could_trap_p (stmt); + VEC_free (data_ref_loc, heap, refs); + return res; } /* Return true when STMT is if-convertible. @@ -621,8 +497,7 @@ ifcvt_could_trap_p (gimple stmt, VEC (data_reference_p, heap) *refs) - LHS is not var decl. */ static bool -if_convertible_gimple_assign_stmt_p (gimple stmt, - VEC (data_reference_p, heap) *refs) +if_convertible_gimple_assign_stmt_p (gimple stmt) { tree lhs = gimple_assign_lhs (stmt); basic_block bb; @@ -650,12 +525,20 @@ if_convertible_gimple_assign_stmt_p (gimple stmt, if (flag_tree_loop_if_convert_stores) { - if (ifcvt_could_trap_p (stmt, refs)) + if (ifcvt_could_trap_p (stmt)) { if (dump_file && (dump_flags & TDF_DETAILS)) fprintf (dump_file, "tree could trap...\n"); return false; } + + if (has_unaligned_memory_refs (stmt)) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "uses misaligned memory...\n"); + return false; + } + return true; } @@ -690,7 +573,7 @@ if_convertible_gimple_assign_stmt_p (gimple stmt, - it is a GIMPLE_LABEL or a GIMPLE_COND. */ static bool -if_convertible_stmt_p (gimple stmt, VEC (data_reference_p, heap) *refs) +if_convertible_stmt_p (gimple stmt) { switch (gimple_code (stmt)) { @@ -700,7 +583,7 @@ if_convertible_stmt_p (gimple stmt, VEC (data_reference_p, heap) *refs) return true; case GIMPLE_ASSIGN: - return if_convertible_gimple_assign_stmt_p (stmt, refs); + return if_convertible_gimple_assign_stmt_p (stmt); default: /* Don't know what to do with 'em so don't do anything. */ @@ -1016,18 +899,6 @@ if_convertible_loop_p_1 (struct loop *loop, if (!res) return false; - if (flag_tree_loop_if_convert_stores) - { - data_reference_p dr; - - for (i = 0; VEC_iterate (data_reference_p, *refs, i, dr); i++) - { - dr->aux = XNEW (struct ifc_dr); - DR_WRITTEN_AT_LEAST_ONCE (dr) = -1; - DR_RW_UNCONDITIONALLY (dr) = -1; - } - } - for (i = 0; i < loop->num_nodes; i++) { basic_block bb = ifc_bbs[i]; @@ -1040,7 +911,7 @@ if_convertible_loop_p_1 (struct loop *loop, /* Check the if-convertibility of statements in predicated BBs. */ if (is_predicated (bb)) for (itr = gsi_start_bb (bb); !gsi_end_p (itr); gsi_next (&itr)) - if (!if_convertible_stmt_p (gsi_stmt (itr), *refs)) + if (!if_convertible_stmt_p (gsi_stmt (itr))) return false; } @@ -1101,15 +972,6 @@ if_convertible_loop_p (struct loop *loop) ddrs = VEC_alloc (ddr_p, heap, 25); res = if_convertible_loop_p_1 (loop, &refs, &ddrs); - if (flag_tree_loop_if_convert_stores) - { - data_reference_p dr; - unsigned int i; - - for (i = 0; VEC_iterate (data_reference_p, refs, i, dr); i++) - free (dr->aux); - } - free_data_refs (refs); free_dependence_relations (ddrs); return res; @@ -1366,6 +1228,78 @@ insert_gimplified_predicates (loop_p loop) } } +/* Insert at the beginning of the first basic block of the current + function the allocation on the stack of N bytes of memory and + return a pointer to this scratchpad memory. */ + +static tree +create_scratchpad (void) +{ + basic_block bb = single_succ (ENTRY_BLOCK_PTR); + gimple_stmt_iterator gsi = gsi_after_labels (bb); + + /* void *tmp = __builtin_alloca */ + const char *name = "scratch_pad"; + tree x = build_int_cst (integer_type_node, 64); + gimple stmt = gimple_build_call (built_in_decls[BUILT_IN_ALLOCA], 1, x); + tree var = create_tmp_var (ptr_type_node, name); + tree tmp = make_ssa_name (var, stmt); + + add_referenced_var (var); + gimple_call_set_lhs (stmt, tmp); + SSA_NAME_DEF_STMT (tmp) = stmt; + update_stmt (stmt); + + gsi_insert_before (&gsi, stmt, GSI_SAME_STMT); + return tmp; +} + +/* Returns a memory reference to the pointer defined by the + conditional expression: pointer = cond ? &A[i] : scratch_pad; and + inserts this code at GSI. */ + +static tree +create_indirect_cond_expr (tree ai, tree cond, tree *scratch_pad, + gimple_stmt_iterator *gsi) +{ + tree type = TREE_TYPE (ai); + + tree pointer_to_type, address_of_ai, addr_expr, cond_expr; + tree pointer, star_pointer; + gimple addr_stmt, pointer_stmt; + + /* address_of_ai = &A[i]; */ + pointer_to_type = build_pointer_type (type); + address_of_ai = create_tmp_var (pointer_to_type, "_ifc_"); + add_referenced_var (address_of_ai); + addr_expr = build_fold_addr_expr (ai); + addr_stmt = gimple_build_assign (address_of_ai, addr_expr); + address_of_ai = make_ssa_name (address_of_ai, addr_stmt); + gimple_assign_set_lhs (addr_stmt, address_of_ai); + SSA_NAME_DEF_STMT (address_of_ai) = addr_stmt; + update_stmt (addr_stmt); + gsi_insert_before (gsi, addr_stmt, GSI_SAME_STMT); + + /* Allocate the scratch pad only once per function. */ + if (!*scratch_pad) + *scratch_pad = create_scratchpad (); + + /* pointer = cond ? address_of_ai : scratch_pad; */ + pointer = create_tmp_var (pointer_to_type, "_ifc_"); + add_referenced_var (pointer); + cond_expr = build3 (COND_EXPR, pointer_to_type, unshare_expr (cond), + address_of_ai, *scratch_pad); + pointer_stmt = gimple_build_assign (pointer, cond_expr); + pointer = make_ssa_name (pointer, pointer_stmt); + gimple_assign_set_lhs (pointer_stmt, pointer); + SSA_NAME_DEF_STMT (pointer) = pointer_stmt; + update_stmt (pointer_stmt); + gsi_insert_before (gsi, pointer_stmt, GSI_SAME_STMT); + + star_pointer = build_simple_mem_ref (pointer); + return star_pointer; +} + /* Predicate each write to memory in LOOP. This function transforms control flow constructs containing memory @@ -1377,10 +1311,19 @@ insert_gimplified_predicates (loop_p loop) into the following form that does not contain control flow: - | for (i = 0; i < N; i++) - | A[i] = cond ? expr : A[i]; + | void *scratch_pad = alloca (MAX_TYPE_SIZE_IN_BYTES); + | + | for (i = 0; i < N; i++) { + | p = cond ? &A[i] : scratch_pad; + | *p = expr; + | } + + SCRATCH_PAD is allocated on the stack for each function once and it is + large enough to contain any kind of scalar assignment or read. All + values read or written to SCRATCH_PAD are not used in the computation. - The original CFG looks like this: + In a more detailed way, the if-conversion of memory writes works + like this, supposing that the original CFG looks like this: | bb_0 | i = 0 @@ -1430,10 +1373,12 @@ insert_gimplified_predicates (loop_p loop) | goto bb_1 | end_bb_4 - predicate_mem_writes is then predicating the memory write as follows: + predicate_mem_writes is then allocating SCRATCH_PAD in the basic block + preceding the loop header, and is predicating the memory write: | bb_0 | i = 0 + | scratch_pad = alloca (MAX_TYPE_SIZE_IN_BYTES); | end_bb_0 | | bb_1 @@ -1441,12 +1386,14 @@ insert_gimplified_predicates (loop_p loop) | end_bb_1 | | bb_2 + | cond = some_computation; | if (cond) goto bb_3 else goto bb_4 | end_bb_2 | | bb_3 | cond = some_computation; - | A[i] = cond ? expr : A[i]; + | p = cond ? &A[i] : scratch_pad; + | *p = expr; | goto bb_4 | end_bb_3 | @@ -1459,12 +1406,14 @@ insert_gimplified_predicates (loop_p loop) | bb_0 | i = 0 + | scratch_pad = alloca (MAX_TYPE_SIZE_IN_BYTES); | if (i < N) goto bb_5 else goto bb_1 | end_bb_0 | | bb_1 | cond = some_computation; - | A[i] = cond ? expr : A[i]; + | p = cond ? &A[i] : scratch_pad; + | *p = expr; | if (i < N) goto bb_5 else goto bb_4 | end_bb_1 | @@ -1474,7 +1423,7 @@ insert_gimplified_predicates (loop_p loop) */ static void -predicate_mem_writes (loop_p loop) +predicate_mem_writes (loop_p loop, tree *scratch_pad) { unsigned int i, orig_loop_num_nodes = loop->num_nodes; @@ -1489,20 +1438,35 @@ predicate_mem_writes (loop_p loop) continue; for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) - if ((stmt = gsi_stmt (gsi)) - && gimple_assign_single_p (stmt) - && gimple_vdef (stmt)) - { - tree lhs = gimple_assign_lhs (stmt); - tree rhs = gimple_assign_rhs1 (stmt); - tree type = TREE_TYPE (lhs); - - lhs = ifc_temp_var (type, unshare_expr (lhs), &gsi); - rhs = ifc_temp_var (type, unshare_expr (rhs), &gsi); - rhs = build3 (COND_EXPR, type, unshare_expr (cond), rhs, lhs); - gimple_assign_set_rhs1 (stmt, ifc_temp_var (type, rhs, &gsi)); - update_stmt (stmt); - } + { + stmt = gsi_stmt (gsi); + if (gimple_assign_single_p (stmt) + && gimple_vdef (stmt)) + { + /* A[i] = x; */ + tree ai = gimple_assign_lhs (stmt); + + /* pointer = cond ? &A[i] : scratch_pad; */ + tree star_pointer = create_indirect_cond_expr (ai, cond, + scratch_pad, &gsi); + /* *pointer = x; */ + gimple_assign_set_lhs (stmt, star_pointer); + update_stmt (stmt); + } + else if (gimple_assign_single_p (stmt) + && gimple_vuse (stmt)) + { + /* x = A[i]; */ + tree ai = gimple_assign_rhs1 (stmt); + + /* pointer = cond ? &A[i] : scratch_pad; */ + tree star_pointer = create_indirect_cond_expr (ai, cond, + scratch_pad, &gsi); + /* x = *pointer; */ + gimple_assign_set_rhs1 (stmt, star_pointer); + update_stmt (stmt); + } + } } } @@ -1552,7 +1516,7 @@ remove_conditions_and_labels (loop_p loop) blocks. Replace PHI nodes with conditional modify expressions. */ static void -combine_blocks (struct loop *loop) +combine_blocks (struct loop *loop, tree *scratch_pad) { basic_block bb, exit_bb, merge_target_bb; unsigned int orig_loop_num_nodes = loop->num_nodes; @@ -1565,7 +1529,7 @@ combine_blocks (struct loop *loop) predicate_all_scalar_phis (loop); if (flag_tree_loop_if_convert_stores) - predicate_mem_writes (loop); + predicate_mem_writes (loop, scratch_pad); /* Merge basic blocks: first remove all the edges in the loop, except for those from the exit block. */ @@ -1654,7 +1618,7 @@ combine_blocks (struct loop *loop) profitability analysis. Returns true when something changed. */ static bool -tree_if_conversion (struct loop *loop) +tree_if_conversion (struct loop *loop, tree *scratch_pad) { bool changed = false; ifc_bbs = NULL; @@ -1666,7 +1630,7 @@ tree_if_conversion (struct loop *loop) /* Now all statements are if-convertible. Combine all the basic blocks into one huge basic block doing the if-conversion on-the-fly. */ - combine_blocks (loop); + combine_blocks (loop, scratch_pad); if (flag_tree_loop_if_convert_stores) mark_sym_for_renaming (gimple_vop (cfun)); @@ -1697,12 +1661,13 @@ main_tree_if_conversion (void) struct loop *loop; bool changed = false; unsigned todo = 0; + tree scratch_pad = NULL_TREE; if (number_of_loops () <= 1) return 0; FOR_EACH_LOOP (li, loop, 0) - changed |= tree_if_conversion (loop); + changed |= tree_if_conversion (loop, &scratch_pad); if (changed) todo |= TODO_cleanup_cfg;