diff mbox

[1/4] Add flag -ftree-loop-if-convert-memory-writes.

Message ID 1278625285-12667-2-git-send-email-sebpop@gmail.com
State New
Headers show

Commit Message

Sebastian Pop July 8, 2010, 9:41 p.m. UTC
This patch adds a flag that controls the replacement of the memory
writes that are in predicated basic blocks with a full write:

for (...)
  if (cond)
    A[i] = foo

is replaced with:

for (...)
  A[i] = cond ? foo : A[i]

In order to do this, we have to call gimple_could_trap_p instead of
gimple_assign_rhs_could_trap_p, as we have to also check that the LHS
of assign stmts does not trap.

	* common.opt (ftree-loop-if-convert-memory-writes): New flag.
	* doc/invoke.texi (ftree-loop-if-convert-memory-writes): Documented.
	* tree-if-conv.c (if_convertible_phi_p): Allow virtual phi nodes when
	flag_loop_if_convert_memory_writes is set.
	(if_convertible_gimple_assign_stmt_p): Allow memory reads and writes
	Do not handle types that do not match is_gimple_reg_type.
	Remove loop and bb parameters.  Call gimple_could_trap_p instead of
	when flag_loop_if_convert_memory_writes	is set, as LHS can contain
	memory refs.
	(if_convertible_stmt_p): Remove loop and bb parameters.  Update calls
	to if_convertible_gimple_assign_stmt_p.
	(if_convertible_loop_p): Update call to if_convertible_stmt_p.
	(gimplify_condition): New.
	(replace_phi_with_cond_gimple_assign_stmt): Renamed
	predicate_scalar_phi.  Do not handle virtual phi nodes.
	(ifconvert_phi_nodes): Renamed predicate_all_scalar_phis.
	Call predicate_scalar_phi.
	(insert_gimplified_predicates): Insert the gimplified predicate of a BB
	just after the labels for flag_loop_if_convert_memory_writes, otherwise
	insert the predicate in the end of the BB.
	(predicate_mem_writes): New.
	(combine_blocks): Call predicate_all_scalar_phis.  When
	flag_loop_if_convert_memory_writes is set, call predicate_mem_writes.
	(tree_if_conversion): Call mark_sym_for_renaming when
	flag_loop_if_convert_memory_writes is set.
	(main_tree_if_conversion): Call update_ssa when
	flag_loop_if_convert_memory_writes is set.

	* gcc.dg/tree-ssa/ifc-4.c: New.
	* gcc.dg/tree-ssa/ifc-7.c: New.
---
 gcc/common.opt                        |    4 +
 gcc/doc/invoke.texi                   |   20 ++++-
 gcc/testsuite/gcc.dg/tree-ssa/ifc-4.c |   53 ++++++++++
 gcc/testsuite/gcc.dg/tree-ssa/ifc-7.c |   29 +++++
 gcc/tree-if-conv.c                    |  181 ++++++++++++++++++++++++++-------
 5 files changed, 247 insertions(+), 40 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ifc-4.c
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ifc-7.c

Comments

Richard Biener Aug. 13, 2010, 8:54 a.m. UTC | #1
On Thu, 8 Jul 2010, Sebastian Pop wrote:

> This patch adds a flag that controls the replacement of the memory
> writes that are in predicated basic blocks with a full write:
> 
> for (...)
>   if (cond)
>     A[i] = foo
> 
> is replaced with:
> 
> for (...)
>   A[i] = cond ? foo : A[i]
> 
> In order to do this, we have to call gimple_could_trap_p instead of
> gimple_assign_rhs_could_trap_p, as we have to also check that the LHS
> of assign stmts does not trap.

Comments below.

> 	* common.opt (ftree-loop-if-convert-memory-writes): New flag.
> 	* doc/invoke.texi (ftree-loop-if-convert-memory-writes): Documented.
> 	* tree-if-conv.c (if_convertible_phi_p): Allow virtual phi nodes when
> 	flag_loop_if_convert_memory_writes is set.
> 	(if_convertible_gimple_assign_stmt_p): Allow memory reads and writes
> 	Do not handle types that do not match is_gimple_reg_type.
> 	Remove loop and bb parameters.  Call gimple_could_trap_p instead of
> 	when flag_loop_if_convert_memory_writes	is set, as LHS can contain
> 	memory refs.
> 	(if_convertible_stmt_p): Remove loop and bb parameters.  Update calls
> 	to if_convertible_gimple_assign_stmt_p.
> 	(if_convertible_loop_p): Update call to if_convertible_stmt_p.
> 	(gimplify_condition): New.
> 	(replace_phi_with_cond_gimple_assign_stmt): Renamed
> 	predicate_scalar_phi.  Do not handle virtual phi nodes.
> 	(ifconvert_phi_nodes): Renamed predicate_all_scalar_phis.
> 	Call predicate_scalar_phi.
> 	(insert_gimplified_predicates): Insert the gimplified predicate of a BB
> 	just after the labels for flag_loop_if_convert_memory_writes, otherwise
> 	insert the predicate in the end of the BB.
> 	(predicate_mem_writes): New.
> 	(combine_blocks): Call predicate_all_scalar_phis.  When
> 	flag_loop_if_convert_memory_writes is set, call predicate_mem_writes.
> 	(tree_if_conversion): Call mark_sym_for_renaming when
> 	flag_loop_if_convert_memory_writes is set.
> 	(main_tree_if_conversion): Call update_ssa when
> 	flag_loop_if_convert_memory_writes is set.
> 
> 	* gcc.dg/tree-ssa/ifc-4.c: New.
> 	* gcc.dg/tree-ssa/ifc-7.c: New.
> ---
>  gcc/common.opt                        |    4 +
>  gcc/doc/invoke.texi                   |   20 ++++-
>  gcc/testsuite/gcc.dg/tree-ssa/ifc-4.c |   53 ++++++++++
>  gcc/testsuite/gcc.dg/tree-ssa/ifc-7.c |   29 +++++
>  gcc/tree-if-conv.c                    |  181 ++++++++++++++++++++++++++-------
>  5 files changed, 247 insertions(+), 40 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ifc-4.c
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ifc-7.c
> 
> diff --git a/gcc/common.opt b/gcc/common.opt
> index 111d7b7..ceb94f8 100644
> --- a/gcc/common.opt
> +++ b/gcc/common.opt
> @@ -657,6 +657,10 @@ ftree-loop-if-convert
>  Common Report Var(flag_tree_loop_if_convert) Init(-1) Optimization
>  Convert conditional jumps in innermost loops to branchless equivalents
>  
> +ftree-loop-if-convert-memory-writes
> +Common Report Var(flag_tree_loop_if_convert_memory_writes) Optimization
> +Also if-convert conditional jumps containing memory writes
> +
>  ; -finhibit-size-directive inhibits output of .size for ELF.
>  ; This is used only for compiling crtstuff.c,
>  ; and it may be extended to other effects
> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> index 4d6a1af..3634069 100644
> --- a/gcc/doc/invoke.texi
> +++ b/gcc/doc/invoke.texi
> @@ -383,7 +383,8 @@ Objective-C and Objective-C++ Dialects}.
>  -fstrict-aliasing -fstrict-overflow -fthread-jumps -ftracer @gol
>  -ftree-builtin-call-dce -ftree-ccp -ftree-ch -ftree-copy-prop @gol
>  -ftree-copyrename -ftree-dce -ftree-dominator-opts -ftree-dse @gol
> --ftree-forwprop -ftree-fre -ftree-loop-if-convert -ftree-loop-im @gol
> +-ftree-forwprop -ftree-fre -ftree-loop-if-convert @gol
> +-ftree-loop-if-convert-memory-writes -ftree-loop-im @gol
>  -ftree-phiprop -ftree-loop-distribution @gol
>  -ftree-loop-ivcanon -ftree-loop-linear -ftree-loop-optimize @gol
>  -ftree-parallelize-loops=@var{n} -ftree-pre -ftree-pta -ftree-reassoc @gol
> @@ -6890,6 +6891,23 @@ the innermost loops in order to improve the ability of the
>  vectorization pass to handle these loops.  This is enabled by default
>  if vectorization is enabled.
>  
> +@item -ftree-loop-if-convert-memory-writes
> +Attempt to also if-convert conditional jumps containing memory writes.
> +This transformation can be unsafe for multi-threaded programs as it
> +transforms conditional memory writes into unconditional memory writes.
> +For example,
> +@smallexample
> +for (i = 0; i < N; i++)
> +  if (cond)
> +    A[i] = expr;
> +@end smallexample
> +would be transformed to
> +@smallexample
> +for (i = 0; i < N; i++)
> +  A[i] = cond ? expr : A[i];
> +@end smallexample
> +potentially producing data races.
> +
>  @item -ftree-loop-distribution
>  Perform loop distribution.  This flag can improve cache performance on
>  big loop bodies and allow further loop optimizations, like
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ifc-4.c b/gcc/testsuite/gcc.dg/tree-ssa/ifc-4.c
> new file mode 100644
> index 0000000..beb1a0e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/ifc-4.c
> @@ -0,0 +1,53 @@
> +/* { dg-do compile } */
> +/* { dg-options "-c -O2 -ftree-vectorize -fdump-tree-ifcvt-stats" { target *-*-* } } */
> +
> +struct ht
> +{
> +  void * (*alloc_subobject) (int);
> +};
> +typedef struct cpp_reader cpp_reader;
> +typedef struct cpp_token cpp_token;
> +typedef struct cpp_macro cpp_macro;
> +enum cpp_ttype
> +{
> +    CPP_PASTE,
> +};
> +struct cpp_token {
> +  __extension__ enum cpp_ttype type : 8;
> +} cpp_comment_table;
> +struct cpp_macro {
> +  union cpp_macro_u
> +  {
> +    cpp_token * tokens;
> +  } exp;
> +  unsigned int count;
> +};
> +struct cpp_reader
> +{
> +  struct ht *hash_table;
> +};
> +create_iso_definition (cpp_reader *pfile, cpp_macro *macro)
> +{
> +  unsigned int num_extra_tokens = 0;
> +  {
> +    cpp_token *tokns =
> +      (cpp_token *) pfile->hash_table->alloc_subobject (sizeof (cpp_token)
> +							* macro->count);
> +    {
> +      cpp_token *normal_dest = tokns;
> +      cpp_token *extra_dest = tokns + macro->count - num_extra_tokens;
> +      unsigned int i;
> +      for (i = 0; i < macro->count; i++)
> +	{
> +	  if (macro->exp.tokens[i].type == CPP_PASTE)
> +	    *extra_dest++ = macro->exp.tokens[i];
> +	  else
> +	    *normal_dest++ = macro->exp.tokens[i];
> +	}
> +    }
> +  }
> +}
> +
> +/* This cannot be if-converted because the stores are to aggregate types.  */
> +/* { dg-final { scan-tree-dump-times "Applying if-conversion" 0 "ifcvt" } } */
> +/* { dg-final { cleanup-tree-dump "ifcvt" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ifc-7.c b/gcc/testsuite/gcc.dg/tree-ssa/ifc-7.c
> new file mode 100644
> index 0000000..4d26dc7
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/ifc-7.c
> @@ -0,0 +1,29 @@
> +/* { dg-do compile } */
> +/* { dg-options "-c -O2 -ftree-vectorize" { target *-*-* } } */
> +
> +typedef struct eqn_d
> +{
> +  int *coef;
> +} *eqn;
> +typedef struct omega_pb_d
> +{
> +  eqn subs;
> +} *omega_pb;
> +
> +omega_pb omega_solve_problem (omega_pb);
> +
> +omega_pb
> +omega_solve_geq (omega_pb pb, int n)
> +{
> +  int i, e;
> +  int j = 0;
> +
> +  for (e = n - 1; e >= 0; e--)
> +    if (pb->subs[e].coef[i] != pb->subs[e].coef[j])
> +      {
> +	pb->subs[e].coef[i] = j;
> +	pb->subs[e].coef[j] = i;
> +      }
> +
> +  return omega_solve_problem (pb);
> +}
> diff --git a/gcc/tree-if-conv.c b/gcc/tree-if-conv.c
> index 7058ee4..34b4159 100644
> --- a/gcc/tree-if-conv.c
> +++ b/gcc/tree-if-conv.c
> @@ -385,9 +385,12 @@ bb_with_exit_edge_p (struct loop *loop, basic_block bb)
>     and it belongs to basic block BB.
>  
>     PHI is not if-convertible if:
> -   - it has more than 2 arguments,
> -   - virtual PHI is immediately used in another PHI node,
> -   - virtual PHI on BB other than header.  */
> +   - it has more than 2 arguments.
> +
> +   When the flag_tree_loop_if_convert_memory_writes is not set, PHI is not
> +   if-convertible if:
> +   - a virtual PHI is immediately used in another PHI node,
> +   - there is a virtual PHI in a BB other than the loop->header.  */
>  
>  static bool
>  if_convertible_phi_p (struct loop *loop, basic_block bb, gimple phi)
> @@ -405,6 +408,12 @@ if_convertible_phi_p (struct loop *loop, basic_block bb, gimple phi)
>        return false;
>      }
>  
> +  if (flag_tree_loop_if_convert_memory_writes)
> +    return true;
> +
> +  /* When the flag_tree_loop_if_convert_memory_writes is not set, check
> +     that there are no memory writes in the branches of the loop to be
> +     if-converted.  */
>    if (!is_gimple_reg (SSA_NAME_VAR (gimple_phi_result (phi))))
>      {
>        imm_use_iterator imm_iter;
> @@ -413,9 +422,10 @@ if_convertible_phi_p (struct loop *loop, basic_block bb, gimple phi)
>        if (bb != loop->header)
>  	{
>  	  if (dump_file && (dump_flags & TDF_DETAILS))
> -	    fprintf (dump_file, "Virtual phi not on loop header.\n");
> +	    fprintf (dump_file, "Virtual phi not on loop->header.\n");
>  	  return false;
>  	}
> +
>        FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (phi))
>  	{
>  	  if (gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
> @@ -435,15 +445,13 @@ if_convertible_phi_p (struct loop *loop, basic_block bb, gimple phi)
>     GIMPLE_ASSIGN statement is not if-convertible if,
>     - it is not movable,
>     - it could trap,
> -   - LHS is not var decl.
> -
> -   GIMPLE_ASSIGN is part of block BB, which is inside loop LOOP.  */
> +   - LHS is not var decl.  */
>  
>  static bool
> -if_convertible_gimple_assign_stmt_p (struct loop *loop, basic_block bb,
> -    				     gimple stmt)
> +if_convertible_gimple_assign_stmt_p (gimple stmt)
>  {
>    tree lhs = gimple_assign_lhs (stmt);
> +  basic_block bb;
>  
>    if (dump_file && (dump_flags & TDF_DETAILS))
>      {
> @@ -451,6 +459,9 @@ if_convertible_gimple_assign_stmt_p (struct loop *loop, basic_block bb,
>        print_gimple_stmt (dump_file, stmt, 0, TDF_SLIM);
>      }
>  
> +  if (!is_gimple_reg_type (TREE_TYPE (lhs)))
> +    return false;
> +
>    /* Some of these constrains might be too conservative.  */
>    if (stmt_ends_bb_p (stmt)
>        || gimple_has_volatile_ops (stmt)
> @@ -463,6 +474,17 @@ if_convertible_gimple_assign_stmt_p (struct loop *loop, basic_block bb,
>        return false;
>      }
>  
> +  if (flag_tree_loop_if_convert_memory_writes)
> +    {
> +      if (gimple_could_trap_p (stmt))
> +	{
> +	  if (dump_file && (dump_flags & TDF_DETAILS))
> +	    fprintf (dump_file, "tree could trap...\n");
> +	  return false;
> +	}
> +      return true;
> +    }
> +
>    if (gimple_assign_rhs_could_trap_p (stmt))
>      {
>        if (dump_file && (dump_flags & TDF_DETAILS))
> @@ -470,9 +492,11 @@ if_convertible_gimple_assign_stmt_p (struct loop *loop, basic_block bb,
>        return false;
>      }
>  
> +  bb = gimple_bb (stmt);
> +
>    if (TREE_CODE (lhs) != SSA_NAME
> -      && bb != loop->header
> -      && !bb_with_exit_edge_p (loop, bb))
> +      && bb != bb->loop_father->header
> +      && !bb_with_exit_edge_p (bb->loop_father, bb))
>      {
>        if (dump_file && (dump_flags & TDF_DETAILS))
>  	{
> @@ -489,12 +513,10 @@ if_convertible_gimple_assign_stmt_p (struct loop *loop, basic_block bb,
>  
>     A statement is if-convertible if:
>     - it is an if-convertible GIMPLE_ASSGIN,
> -   - it is a GIMPLE_LABEL or a GIMPLE_COND.
> -
> -   STMT is inside BB, which is inside loop LOOP.  */
> +   - it is a GIMPLE_LABEL or a GIMPLE_COND.  */
>  
>  static bool
> -if_convertible_stmt_p (struct loop *loop, basic_block bb, gimple stmt)
> +if_convertible_stmt_p (gimple stmt)
>  {
>    switch (gimple_code (stmt))
>      {
> @@ -504,7 +526,7 @@ if_convertible_stmt_p (struct loop *loop, basic_block bb, gimple stmt)
>        return true;
>  
>      case GIMPLE_ASSIGN:
> -      return if_convertible_gimple_assign_stmt_p (loop, bb, stmt);
> +      return if_convertible_gimple_assign_stmt_p (stmt);
>  
>      default:
>        /* Don't know what to do with 'em so don't do anything.  */
> @@ -875,7 +897,7 @@ if_convertible_loop_p (struct loop *loop)
>  	continue;
>  
>        for (itr = gsi_start_bb (bb); !gsi_end_p (itr); gsi_next (&itr))
> -	if (!if_convertible_stmt_p (loop, bb, gsi_stmt (itr)))
> +	if (!if_convertible_stmt_p (gsi_stmt (itr)))
>  	  return false;
>      }
>  
> @@ -895,7 +917,7 @@ if_convertible_loop_p (struct loop *loop)
>  static basic_block
>  find_phi_replacement_condition (struct loop *loop,
>  				basic_block bb, tree *cond,
> -                                gimple_stmt_iterator *gsi)
> +				gimple_stmt_iterator *gsi)
>  {
>    edge first_edge, second_edge;
>    tree tmp_cond;
> @@ -980,8 +1002,9 @@ find_phi_replacement_condition (struct loop *loop,
>    return first_edge->src;
>  }
>  
> -/* Replace PHI node with conditional modify expr using COND.  This
> -   routine does not handle PHI nodes with more than two arguments.
> +/* Replace a scalar PHI node with a COND_EXPR using COND as condition.
> +   This routine does not handle PHI nodes with more than two
> +   arguments.
>  
>     For example,
>       S1: A = PHI <x1(1), x2(5)
> @@ -993,18 +1016,22 @@ find_phi_replacement_condition (struct loop *loop,
>     TRUE_BB is selected.  */
>  
>  static void
> -replace_phi_with_cond_gimple_assign_stmt (gimple phi, tree cond,
> -    					  basic_block true_bb,
> -                                   	  gimple_stmt_iterator *gsi)
> +predicate_scalar_phi (gimple phi, tree cond,
> +		      basic_block true_bb,
> +		      gimple_stmt_iterator *gsi)
>  {
>    gimple new_stmt;
>    basic_block bb;
> -  tree rhs;
> -  tree arg;
> +  tree rhs, res, arg;
>  
>    gcc_assert (gimple_code (phi) == GIMPLE_PHI
>  	      && gimple_phi_num_args (phi) == 2);
>  
> +  res = gimple_phi_result (phi);
> +  /* Do not handle virtual phi nodes.  */
> +  if (!is_gimple_reg (SSA_NAME_VAR (res)))
> +    return;
> +
>    bb = gimple_bb (phi);
>  
>    arg = degenerate_phi_result (phi);
> @@ -1026,11 +1053,11 @@ replace_phi_with_cond_gimple_assign_stmt (gimple phi, tree cond,
>  	}
>  
>        /* Build new RHS using selected condition and arguments.  */
> -      rhs = build3 (COND_EXPR, TREE_TYPE (PHI_RESULT (phi)),
> +      rhs = build3 (COND_EXPR, TREE_TYPE (res),
>  		    unshare_expr (cond), arg_0, arg_1);
>      }
>  
> -  new_stmt = gimple_build_assign (PHI_RESULT (phi), rhs);
> +  new_stmt = gimple_build_assign (res, rhs);
>    SSA_NAME_DEF_STMT (gimple_phi_result (phi)) = new_stmt;
>    gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
>    update_stmt (new_stmt);
> @@ -1042,11 +1069,11 @@ replace_phi_with_cond_gimple_assign_stmt (gimple phi, tree cond,
>      }
>  }
>  
> -/* Replaces in LOOP all the phi nodes other than those in the
> +/* Replaces in LOOP all the scalar phi nodes other than those in the
>     LOOP->header block with conditional modify expressions.  */
>  
>  static void
> -ifconvert_phi_nodes (struct loop *loop)
> +predicate_all_scalar_phis (struct loop *loop)
>  {
>    basic_block bb;
>    unsigned int orig_loop_num_nodes = loop->num_nodes;
> @@ -1075,7 +1102,7 @@ ifconvert_phi_nodes (struct loop *loop)
>        while (!gsi_end_p (phi_gsi))
>  	{
>  	  phi = gsi_stmt (phi_gsi);
> -	  replace_phi_with_cond_gimple_assign_stmt (phi, cond, true_bb, &gsi);
> +	  predicate_scalar_phi (phi, cond, true_bb, &gsi);
>  	  release_phi_node (phi);
>  	  gsi_next (&phi_gsi);
>  	}
> @@ -1095,7 +1122,7 @@ insert_gimplified_predicates (loop_p loop)
>    for (i = 0; i < loop->num_nodes; i++)
>      {
>        basic_block bb = ifc_bbs[i];
> -      gimple_seq stmts = bb_predicate_gimplified_stmts (bb);
> +      gimple_seq stmts;
>  
>        if (!is_predicated (bb))
>  	{
> @@ -1106,15 +1133,30 @@ insert_gimplified_predicates (loop_p loop)
>  	  continue;
>  	}
>  
> +      stmts = bb_predicate_gimplified_stmts (bb);
>        if (stmts)
>  	{
> -	  gimple_stmt_iterator gsi = gsi_last_bb (bb);
> -
> -	  if (gsi_end_p (gsi)
> -	      || gimple_code (gsi_stmt (gsi)) == GIMPLE_COND)
> -	    gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
> +	  if (flag_tree_loop_if_convert_memory_writes)
> +	    {
> +	      /* Insert the predicate of the BB just after the label,
> +		 as the if-conversion of memory writes will use this
> +		 predicate.  */
> +	      gimple_stmt_iterator gsi = gsi_after_labels (bb);
> +	      gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
> +	    }
>  	  else
> -	    gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
> +	    {
> +	      /* Insert the predicate of the BB at the end of the BB
> +		 as this would reduce the register pressure: the only
> +		 use of this predicate will be in successor BBs.  */
> +	      gimple_stmt_iterator gsi = gsi_last_bb (bb);
> +
> +	      if (gsi_end_p (gsi)
> +		  || gimple_code (gsi_stmt (gsi)) == GIMPLE_COND)

  || stmt_ends_bb_p (gsi_stmt (gsi))

> +		gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
> +	      else
> +		gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
> +	    }

It's not clear to me why the case for 
flag_tree_loop_if_convert_memory_writes is special.

>  	  /* Once the sequence is code generated, set it to NULL.  */
>  	  set_bb_predicate_gimplified_stmts (bb, NULL);
> @@ -1122,6 +1164,56 @@ insert_gimplified_predicates (loop_p loop)
>      }
>  }
>  
> +/* Predicate each write to memory in LOOP.
> +
> +   Replace a statement "A[i] = foo" with "A[i] = cond ? foo : A[i]"
> +   with the condition COND determined from the predicate of the basic
> +   block containing the statement.  */

Instead of writing A[i] = cond ? foo : A[i] in the comment, can you
explain the CFG transformation here, especially the placement of
the load and the store?  See above - this probably would have
made things clearer to me - in the function below there are no
comments either.

> +static void
> +predicate_mem_writes (loop_p loop)
> +{
> +  unsigned int i, orig_loop_num_nodes = loop->num_nodes;
> +
> +  for (i = 1; i < orig_loop_num_nodes; i++)
> +    {
> +      gimple_stmt_iterator gsi;
> +      basic_block bb = ifc_bbs[i];
> +      tree cond = bb_predicate (bb);
> +
> +      if (is_true_predicate (cond))
> +	continue;
> +
> +      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))

So this is another loop over all loops and statements.  Why not
do this loop once and dispatch to mem/scalar if-conversion there?

> +	if (is_gimple_assign (gsi_stmt (gsi))

Stores always are gimple_assign_single_p, so that's a better predicate...

> +	    && gimple_vdef (gsi_stmt (gsi)))
> +	  {
> +	    gimple new_stmt, lhs_stmt, rhs_stmt;
> +	    gimple stmt = gsi_stmt (gsi);
> +	    tree lhs = gimple_get_lhs (stmt);
> +	    tree rhs = gimple_op (stmt, 1);

Please use gimple_assign_lhs and gimple_assign_rhs1.

> +
> +	    gcc_assert (gimple_num_ops (stmt) == 2);

... and makes this a useless assert (it's too lowlevel anyway).

> +
> +	    rhs_stmt = ifc_temp_var (TREE_TYPE (rhs), unshare_expr (rhs));
> +	    gsi_insert_before (&gsi, rhs_stmt, GSI_SAME_STMT);
> +	    rhs = gimple_get_lhs (rhs_stmt);

gimple_assign_lhs.  The ifc_temp_var helper doesn't look very well
designed - why not pass it a gsi to insert the stmt and return
the SSA name temporary instead?

> +
> +	    lhs_stmt = ifc_temp_var (TREE_TYPE (lhs), unshare_expr (lhs));
> +	    gsi_insert_before (&gsi, lhs_stmt, GSI_SAME_STMT);
> +	    lhs = gimple_get_lhs (lhs_stmt);

You should use the same type for lhs_stmt and rhs_stmt temporary,
in fact the type of the lhs is the correct one to use
(useless_type_conversion (lhs_type, rhs_type) is known to be true).

> +	    cond = unshare_expr (cond);
> +	    rhs = build3 (COND_EXPR, TREE_TYPE (lhs), cond, rhs, lhs);
> +
> +	    new_stmt = ifc_temp_var (TREE_TYPE (lhs), rhs);
> +	    gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
> +	    gimple_set_op (stmt, 1, gimple_get_lhs (new_stmt));

gimple_assign_set_rhs1.

So you create this series of stmts in place of the conditional store,
which means I don't see why materializing the condition before
the cond-expr of the dominator does not work.

> +	    update_stmt (stmt);
> +	  }
> +    }
> +}
> +
>  /* Remove all GIMPLE_CONDs and GIMPLE_LABELs of all the basic blocks
>     other than the exit and latch of the LOOP.  Also resets the
>     GIMPLE_DEBUG information.  */
> @@ -1178,7 +1270,10 @@ combine_blocks (struct loop *loop)
>  
>    remove_conditions_and_labels (loop);
>    insert_gimplified_predicates (loop);
> -  ifconvert_phi_nodes (loop);
> +  predicate_all_scalar_phis (loop);
> +
> +  if (flag_tree_loop_if_convert_memory_writes)
> +    predicate_mem_writes (loop);

As suggested above, predicate_all_scalar_phis and predicate_mem_writes
should loop over all loops/bbs once.

>    /* Merge basic blocks: first remove all the edges in the loop,
>       except for those from the exit block.  */
> @@ -1280,6 +1375,10 @@ tree_if_conversion (struct loop *loop)
>       blocks into one huge basic block doing the if-conversion
>       on-the-fly.  */
>    combine_blocks (loop);
> +
> +  if (flag_tree_loop_if_convert_memory_writes)
> +    mark_sym_for_renaming (gimple_vop (cfun));

That should have happened automatically (and in fact is not necessary
if you update virtual SSA form properly - but you can leave that
for a followup, just remove that stmt).

>    changed = true;
>  
>   cleanup:
> @@ -1312,6 +1411,9 @@ main_tree_if_conversion (void)
>    FOR_EACH_LOOP (li, loop, 0)
>      changed |= tree_if_conversion (loop);
>  
> +  if (changed && flag_tree_loop_if_convert_memory_writes)
> +    update_ssa (TODO_update_ssa);

Please return

TODO_update_ssa_only_virtuals

instead.

Please update the patch with the suggested changes and clarifications.

Thanks,
Richard.

>    return changed ? TODO_cleanup_cfg : 0;
>  }
>  
> @@ -1321,7 +1423,8 @@ static bool
>  gate_tree_if_conversion (void)
>  {
>    return ((flag_tree_vectorize && flag_tree_loop_if_convert != 0)
> -	  || flag_tree_loop_if_convert == 1);
> +	  || flag_tree_loop_if_convert == 1
> +	  || flag_tree_loop_if_convert_memory_writes == 1);
>  }
>  
>  struct gimple_opt_pass pass_if_conversion =
Sebastian Pop Aug. 13, 2010, 8:33 p.m. UTC | #2
On Fri, Aug 13, 2010 at 03:54, Richard Guenther <rguenther@suse.de> wrote:
>> +       if (flag_tree_loop_if_convert_memory_writes)
>> +         {
>> +           /* Insert the predicate of the BB just after the label,
>> +              as the if-conversion of memory writes will use this
>> +              predicate.  */
>> +           gimple_stmt_iterator gsi = gsi_after_labels (bb);
>> +           gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
>> +         }
>>         else
>> -         gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
>> +         {
>> +           /* Insert the predicate of the BB at the end of the BB
>> +              as this would reduce the register pressure: the only
>> +              use of this predicate will be in successor BBs.  */
>> +           gimple_stmt_iterator gsi = gsi_last_bb (bb);
>> +
>> +           if (gsi_end_p (gsi)
>> +               || gimple_code (gsi_stmt (gsi)) == GIMPLE_COND)
>
>  || stmt_ends_bb_p (gsi_stmt (gsi))
>
>> +             gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
>> +           else
>> +             gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
>> +         }
>
> It's not clear to me why the case for
> flag_tree_loop_if_convert_memory_writes is special.
>
>>         /* Once the sequence is code generated, set it to NULL.  */
>>         set_bb_predicate_gimplified_stmts (bb, NULL);
>> @@ -1122,6 +1164,56 @@ insert_gimplified_predicates (loop_p loop)
>>      }
>>  }
>>
>> +/* Predicate each write to memory in LOOP.
>> +
>> +   Replace a statement "A[i] = foo" with "A[i] = cond ? foo : A[i]"
>> +   with the condition COND determined from the predicate of the basic
>> +   block containing the statement.  */
>
> Instead of writing A[i] = cond ? foo : A[i] in the comment, can you
> explain the CFG transformation here, especially the placement of
> the load and the store?  See above - this probably would have
> made things clearer to me - in the function below there are no
> comments either.
>

Let me know if the explanation below can be improved.

This function transforms control flow constructs containing memory
writes of the form

| for (i = 0; i < N; i++)
|   if (cond)
|     A[i] = expr;

into the following form that does not contain control flow:

| for (i = 0; i < N; i++)
|   A[i] = cond ? expr : A[i];

The original CFG looks like this:

| bb_0
|   i = 0
| end_bb_0
|
| bb_1
|   if (i < N) goto bb_5 else goto bb_2
| end_bb_1
|
| bb_2
|   cond = some_computation;
|   if (cond) goto bb_3 else goto bb_4
| end_bb_2
|
| bb_3
|   A[i] = expr;
|   goto bb_4
| end_bb_3
|
| bb_4
|   goto bb_1
| end_bb_4

insert_gimplified_predicates inserts the computation of the COND
expression at the beginning of the destination basic block:

| bb_0
|   i = 0
| end_bb_0
|
| bb_1
|   if (i < N) goto bb_5 else goto bb_2
| end_bb_1
|
| bb_2
|   cond = some_computation;
|   if (cond) goto bb_3 else goto bb_4
| end_bb_2
|
| bb_3
|   cond = some_computation;
|   A[i] = expr;
|   goto bb_4
| end_bb_3
|
| bb_4
|   goto bb_1
| end_bb_4

predicate_mem_writes is then predicating the memory write as follows:

| bb_0
|   i = 0
| end_bb_0
|
| bb_1
|   if (i < N) goto bb_5 else goto bb_2
| end_bb_1
|
| bb_2
|   if (cond) goto bb_3 else goto bb_4
| end_bb_2
|
| bb_3
|   cond = some_computation;
|   A[i] = cond ? expr : A[i];
|   goto bb_4
| end_bb_3
|
| bb_4
|   goto bb_1
| end_bb_4

and finally combine_blocks removes the basic block boundaries making
the loop vectorizable:

| bb_0
|   i = 0
|   if (i < N) goto bb_5 else goto bb_1
| end_bb_0
|
| bb_1
|   cond = some_computation;
|   A[i] = cond ? expr : A[i];
|   if (i < N) goto bb_5 else goto bb_4
| end_bb_1
|
| bb_4
|   goto bb_1
| end_bb_4
Sebastian Pop Aug. 13, 2010, 10:10 p.m. UTC | #3
On Fri, Aug 13, 2010 at 03:54, Richard Guenther <rguenther@suse.de> wrote:
> So you create this series of stmts in place of the conditional store,
> which means I don't see why materializing the condition before
> the cond-expr of the dominator does not work.
>

Yes, this could work.  Inserting the predicate computations anywhere
before the predicated BB is fine.  The reason why we have to insert
the predicates before or at the beginning of the BB in the case of mem
writes if-conversion is that we need the predicates to build the
conditional move statements.

For scalar if-conversion, we need the predicates as close as possible
to the phi nodes that merge the scalar values from different branches
in order to have low register pressure.

Sebastian
Richard Biener Aug. 16, 2010, 9:42 a.m. UTC | #4
On Fri, 13 Aug 2010, Sebastian Pop wrote:

> On Fri, Aug 13, 2010 at 03:54, Richard Guenther <rguenther@suse.de> wrote:
> >> +       if (flag_tree_loop_if_convert_memory_writes)
> >> +         {
> >> +           /* Insert the predicate of the BB just after the label,
> >> +              as the if-conversion of memory writes will use this
> >> +              predicate.  */
> >> +           gimple_stmt_iterator gsi = gsi_after_labels (bb);
> >> +           gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
> >> +         }
> >>         else
> >> -         gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
> >> +         {
> >> +           /* Insert the predicate of the BB at the end of the BB
> >> +              as this would reduce the register pressure: the only
> >> +              use of this predicate will be in successor BBs.  */
> >> +           gimple_stmt_iterator gsi = gsi_last_bb (bb);
> >> +
> >> +           if (gsi_end_p (gsi)
> >> +               || gimple_code (gsi_stmt (gsi)) == GIMPLE_COND)
> >
> >  || stmt_ends_bb_p (gsi_stmt (gsi))
> >
> >> +             gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
> >> +           else
> >> +             gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
> >> +         }
> >
> > It's not clear to me why the case for
> > flag_tree_loop_if_convert_memory_writes is special.
> >
> >>         /* Once the sequence is code generated, set it to NULL.  */
> >>         set_bb_predicate_gimplified_stmts (bb, NULL);
> >> @@ -1122,6 +1164,56 @@ insert_gimplified_predicates (loop_p loop)
> >>      }
> >>  }
> >>
> >> +/* Predicate each write to memory in LOOP.
> >> +
> >> +   Replace a statement "A[i] = foo" with "A[i] = cond ? foo : A[i]"
> >> +   with the condition COND determined from the predicate of the basic
> >> +   block containing the statement.  */
> >
> > Instead of writing A[i] = cond ? foo : A[i] in the comment, can you
> > explain the CFG transformation here, especially the placement of
> > the load and the store?  See above - this probably would have
> > made things clearer to me - in the function below there are no
> > comments either.
> >
> 
> Let me know if the explanation below can be improved.

That looks good.

> This function transforms control flow constructs containing memory
> writes of the form
> 
> | for (i = 0; i < N; i++)
> |   if (cond)
> |     A[i] = expr;
> 
> into the following form that does not contain control flow:
> 
> | for (i = 0; i < N; i++)
> |   A[i] = cond ? expr : A[i];
> 
> The original CFG looks like this:
> 
> | bb_0
> |   i = 0
> | end_bb_0
> |
> | bb_1
> |   if (i < N) goto bb_5 else goto bb_2
> | end_bb_1
> |
> | bb_2
> |   cond = some_computation;
> |   if (cond) goto bb_3 else goto bb_4
> | end_bb_2
> |
> | bb_3
> |   A[i] = expr;
> |   goto bb_4
> | end_bb_3
> |
> | bb_4
> |   goto bb_1
> | end_bb_4
> 
> insert_gimplified_predicates inserts the computation of the COND
> expression at the beginning of the destination basic block:
> 
> | bb_0
> |   i = 0
> | end_bb_0
> |
> | bb_1
> |   if (i < N) goto bb_5 else goto bb_2
> | end_bb_1
> |
> | bb_2
> |   cond = some_computation;
> |   if (cond) goto bb_3 else goto bb_4
> | end_bb_2
> |
> | bb_3
> |   cond = some_computation;
> |   A[i] = expr;
> |   goto bb_4
> | end_bb_3
> |
> | bb_4
> |   goto bb_1
> | end_bb_4
> 
> predicate_mem_writes is then predicating the memory write as follows:
> 
> | bb_0
> |   i = 0
> | end_bb_0
> |
> | bb_1
> |   if (i < N) goto bb_5 else goto bb_2
> | end_bb_1
> |
> | bb_2
> |   if (cond) goto bb_3 else goto bb_4
> | end_bb_2
> |
> | bb_3
> |   cond = some_computation;
> |   A[i] = cond ? expr : A[i];
> |   goto bb_4
> | end_bb_3
> |
> | bb_4
> |   goto bb_1
> | end_bb_4
> 
> and finally combine_blocks removes the basic block boundaries making
> the loop vectorizable:
> 
> | bb_0
> |   i = 0
> |   if (i < N) goto bb_5 else goto bb_1
> | end_bb_0
> |
> | bb_1
> |   cond = some_computation;
> |   A[i] = cond ? expr : A[i];
> |   if (i < N) goto bb_5 else goto bb_4
> | end_bb_1
> |
> | bb_4
> |   goto bb_1
> | end_bb_4
>
Richard Biener Aug. 16, 2010, 9:46 a.m. UTC | #5
On Fri, 13 Aug 2010, Sebastian Pop wrote:

> On Fri, Aug 13, 2010 at 03:54, Richard Guenther <rguenther@suse.de> wrote:
> > So you create this series of stmts in place of the conditional store,
> > which means I don't see why materializing the condition before
> > the cond-expr of the dominator does not work.
> >
> 
> Yes, this could work.  Inserting the predicate computations anywhere
> before the predicated BB is fine.  The reason why we have to insert
> the predicates before or at the beginning of the BB in the case of mem
> writes if-conversion is that we need the predicates to build the
> conditional move statements.
>
> For scalar if-conversion, we need the predicates as close as possible
> to the phi nodes that merge the scalar values from different branches
> in order to have low register pressure.

Ok, I see.

Richard.
diff mbox

Patch

diff --git a/gcc/common.opt b/gcc/common.opt
index 111d7b7..ceb94f8 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -657,6 +657,10 @@  ftree-loop-if-convert
 Common Report Var(flag_tree_loop_if_convert) Init(-1) Optimization
 Convert conditional jumps in innermost loops to branchless equivalents
 
+ftree-loop-if-convert-memory-writes
+Common Report Var(flag_tree_loop_if_convert_memory_writes) Optimization
+Also if-convert conditional jumps containing memory writes
+
 ; -finhibit-size-directive inhibits output of .size for ELF.
 ; This is used only for compiling crtstuff.c,
 ; and it may be extended to other effects
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 4d6a1af..3634069 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -383,7 +383,8 @@  Objective-C and Objective-C++ Dialects}.
 -fstrict-aliasing -fstrict-overflow -fthread-jumps -ftracer @gol
 -ftree-builtin-call-dce -ftree-ccp -ftree-ch -ftree-copy-prop @gol
 -ftree-copyrename -ftree-dce -ftree-dominator-opts -ftree-dse @gol
--ftree-forwprop -ftree-fre -ftree-loop-if-convert -ftree-loop-im @gol
+-ftree-forwprop -ftree-fre -ftree-loop-if-convert @gol
+-ftree-loop-if-convert-memory-writes -ftree-loop-im @gol
 -ftree-phiprop -ftree-loop-distribution @gol
 -ftree-loop-ivcanon -ftree-loop-linear -ftree-loop-optimize @gol
 -ftree-parallelize-loops=@var{n} -ftree-pre -ftree-pta -ftree-reassoc @gol
@@ -6890,6 +6891,23 @@  the innermost loops in order to improve the ability of the
 vectorization pass to handle these loops.  This is enabled by default
 if vectorization is enabled.
 
+@item -ftree-loop-if-convert-memory-writes
+Attempt to also if-convert conditional jumps containing memory writes.
+This transformation can be unsafe for multi-threaded programs as it
+transforms conditional memory writes into unconditional memory writes.
+For example,
+@smallexample
+for (i = 0; i < N; i++)
+  if (cond)
+    A[i] = expr;
+@end smallexample
+would be transformed to
+@smallexample
+for (i = 0; i < N; i++)
+  A[i] = cond ? expr : A[i];
+@end smallexample
+potentially producing data races.
+
 @item -ftree-loop-distribution
 Perform loop distribution.  This flag can improve cache performance on
 big loop bodies and allow further loop optimizations, like
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ifc-4.c b/gcc/testsuite/gcc.dg/tree-ssa/ifc-4.c
new file mode 100644
index 0000000..beb1a0e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ifc-4.c
@@ -0,0 +1,53 @@ 
+/* { dg-do compile } */
+/* { dg-options "-c -O2 -ftree-vectorize -fdump-tree-ifcvt-stats" { target *-*-* } } */
+
+struct ht
+{
+  void * (*alloc_subobject) (int);
+};
+typedef struct cpp_reader cpp_reader;
+typedef struct cpp_token cpp_token;
+typedef struct cpp_macro cpp_macro;
+enum cpp_ttype
+{
+    CPP_PASTE,
+};
+struct cpp_token {
+  __extension__ enum cpp_ttype type : 8;
+} cpp_comment_table;
+struct cpp_macro {
+  union cpp_macro_u
+  {
+    cpp_token * tokens;
+  } exp;
+  unsigned int count;
+};
+struct cpp_reader
+{
+  struct ht *hash_table;
+};
+create_iso_definition (cpp_reader *pfile, cpp_macro *macro)
+{
+  unsigned int num_extra_tokens = 0;
+  {
+    cpp_token *tokns =
+      (cpp_token *) pfile->hash_table->alloc_subobject (sizeof (cpp_token)
+							* macro->count);
+    {
+      cpp_token *normal_dest = tokns;
+      cpp_token *extra_dest = tokns + macro->count - num_extra_tokens;
+      unsigned int i;
+      for (i = 0; i < macro->count; i++)
+	{
+	  if (macro->exp.tokens[i].type == CPP_PASTE)
+	    *extra_dest++ = macro->exp.tokens[i];
+	  else
+	    *normal_dest++ = macro->exp.tokens[i];
+	}
+    }
+  }
+}
+
+/* This cannot be if-converted because the stores are to aggregate types.  */
+/* { dg-final { scan-tree-dump-times "Applying if-conversion" 0 "ifcvt" } } */
+/* { dg-final { cleanup-tree-dump "ifcvt" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ifc-7.c b/gcc/testsuite/gcc.dg/tree-ssa/ifc-7.c
new file mode 100644
index 0000000..4d26dc7
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ifc-7.c
@@ -0,0 +1,29 @@ 
+/* { dg-do compile } */
+/* { dg-options "-c -O2 -ftree-vectorize" { target *-*-* } } */
+
+typedef struct eqn_d
+{
+  int *coef;
+} *eqn;
+typedef struct omega_pb_d
+{
+  eqn subs;
+} *omega_pb;
+
+omega_pb omega_solve_problem (omega_pb);
+
+omega_pb
+omega_solve_geq (omega_pb pb, int n)
+{
+  int i, e;
+  int j = 0;
+
+  for (e = n - 1; e >= 0; e--)
+    if (pb->subs[e].coef[i] != pb->subs[e].coef[j])
+      {
+	pb->subs[e].coef[i] = j;
+	pb->subs[e].coef[j] = i;
+      }
+
+  return omega_solve_problem (pb);
+}
diff --git a/gcc/tree-if-conv.c b/gcc/tree-if-conv.c
index 7058ee4..34b4159 100644
--- a/gcc/tree-if-conv.c
+++ b/gcc/tree-if-conv.c
@@ -385,9 +385,12 @@  bb_with_exit_edge_p (struct loop *loop, basic_block bb)
    and it belongs to basic block BB.
 
    PHI is not if-convertible if:
-   - it has more than 2 arguments,
-   - virtual PHI is immediately used in another PHI node,
-   - virtual PHI on BB other than header.  */
+   - it has more than 2 arguments.
+
+   When the flag_tree_loop_if_convert_memory_writes is not set, PHI is not
+   if-convertible if:
+   - a virtual PHI is immediately used in another PHI node,
+   - there is a virtual PHI in a BB other than the loop->header.  */
 
 static bool
 if_convertible_phi_p (struct loop *loop, basic_block bb, gimple phi)
@@ -405,6 +408,12 @@  if_convertible_phi_p (struct loop *loop, basic_block bb, gimple phi)
       return false;
     }
 
+  if (flag_tree_loop_if_convert_memory_writes)
+    return true;
+
+  /* When the flag_tree_loop_if_convert_memory_writes is not set, check
+     that there are no memory writes in the branches of the loop to be
+     if-converted.  */
   if (!is_gimple_reg (SSA_NAME_VAR (gimple_phi_result (phi))))
     {
       imm_use_iterator imm_iter;
@@ -413,9 +422,10 @@  if_convertible_phi_p (struct loop *loop, basic_block bb, gimple phi)
       if (bb != loop->header)
 	{
 	  if (dump_file && (dump_flags & TDF_DETAILS))
-	    fprintf (dump_file, "Virtual phi not on loop header.\n");
+	    fprintf (dump_file, "Virtual phi not on loop->header.\n");
 	  return false;
 	}
+
       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (phi))
 	{
 	  if (gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
@@ -435,15 +445,13 @@  if_convertible_phi_p (struct loop *loop, basic_block bb, gimple phi)
    GIMPLE_ASSIGN statement is not if-convertible if,
    - it is not movable,
    - it could trap,
-   - LHS is not var decl.
-
-   GIMPLE_ASSIGN is part of block BB, which is inside loop LOOP.  */
+   - LHS is not var decl.  */
 
 static bool
-if_convertible_gimple_assign_stmt_p (struct loop *loop, basic_block bb,
-    				     gimple stmt)
+if_convertible_gimple_assign_stmt_p (gimple stmt)
 {
   tree lhs = gimple_assign_lhs (stmt);
+  basic_block bb;
 
   if (dump_file && (dump_flags & TDF_DETAILS))
     {
@@ -451,6 +459,9 @@  if_convertible_gimple_assign_stmt_p (struct loop *loop, basic_block bb,
       print_gimple_stmt (dump_file, stmt, 0, TDF_SLIM);
     }
 
+  if (!is_gimple_reg_type (TREE_TYPE (lhs)))
+    return false;
+
   /* Some of these constrains might be too conservative.  */
   if (stmt_ends_bb_p (stmt)
       || gimple_has_volatile_ops (stmt)
@@ -463,6 +474,17 @@  if_convertible_gimple_assign_stmt_p (struct loop *loop, basic_block bb,
       return false;
     }
 
+  if (flag_tree_loop_if_convert_memory_writes)
+    {
+      if (gimple_could_trap_p (stmt))
+	{
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    fprintf (dump_file, "tree could trap...\n");
+	  return false;
+	}
+      return true;
+    }
+
   if (gimple_assign_rhs_could_trap_p (stmt))
     {
       if (dump_file && (dump_flags & TDF_DETAILS))
@@ -470,9 +492,11 @@  if_convertible_gimple_assign_stmt_p (struct loop *loop, basic_block bb,
       return false;
     }
 
+  bb = gimple_bb (stmt);
+
   if (TREE_CODE (lhs) != SSA_NAME
-      && bb != loop->header
-      && !bb_with_exit_edge_p (loop, bb))
+      && bb != bb->loop_father->header
+      && !bb_with_exit_edge_p (bb->loop_father, bb))
     {
       if (dump_file && (dump_flags & TDF_DETAILS))
 	{
@@ -489,12 +513,10 @@  if_convertible_gimple_assign_stmt_p (struct loop *loop, basic_block bb,
 
    A statement is if-convertible if:
    - it is an if-convertible GIMPLE_ASSGIN,
-   - it is a GIMPLE_LABEL or a GIMPLE_COND.
-
-   STMT is inside BB, which is inside loop LOOP.  */
+   - it is a GIMPLE_LABEL or a GIMPLE_COND.  */
 
 static bool
-if_convertible_stmt_p (struct loop *loop, basic_block bb, gimple stmt)
+if_convertible_stmt_p (gimple stmt)
 {
   switch (gimple_code (stmt))
     {
@@ -504,7 +526,7 @@  if_convertible_stmt_p (struct loop *loop, basic_block bb, gimple stmt)
       return true;
 
     case GIMPLE_ASSIGN:
-      return if_convertible_gimple_assign_stmt_p (loop, bb, stmt);
+      return if_convertible_gimple_assign_stmt_p (stmt);
 
     default:
       /* Don't know what to do with 'em so don't do anything.  */
@@ -875,7 +897,7 @@  if_convertible_loop_p (struct loop *loop)
 	continue;
 
       for (itr = gsi_start_bb (bb); !gsi_end_p (itr); gsi_next (&itr))
-	if (!if_convertible_stmt_p (loop, bb, gsi_stmt (itr)))
+	if (!if_convertible_stmt_p (gsi_stmt (itr)))
 	  return false;
     }
 
@@ -895,7 +917,7 @@  if_convertible_loop_p (struct loop *loop)
 static basic_block
 find_phi_replacement_condition (struct loop *loop,
 				basic_block bb, tree *cond,
-                                gimple_stmt_iterator *gsi)
+				gimple_stmt_iterator *gsi)
 {
   edge first_edge, second_edge;
   tree tmp_cond;
@@ -980,8 +1002,9 @@  find_phi_replacement_condition (struct loop *loop,
   return first_edge->src;
 }
 
-/* Replace PHI node with conditional modify expr using COND.  This
-   routine does not handle PHI nodes with more than two arguments.
+/* Replace a scalar PHI node with a COND_EXPR using COND as condition.
+   This routine does not handle PHI nodes with more than two
+   arguments.
 
    For example,
      S1: A = PHI <x1(1), x2(5)
@@ -993,18 +1016,22 @@  find_phi_replacement_condition (struct loop *loop,
    TRUE_BB is selected.  */
 
 static void
-replace_phi_with_cond_gimple_assign_stmt (gimple phi, tree cond,
-    					  basic_block true_bb,
-                                   	  gimple_stmt_iterator *gsi)
+predicate_scalar_phi (gimple phi, tree cond,
+		      basic_block true_bb,
+		      gimple_stmt_iterator *gsi)
 {
   gimple new_stmt;
   basic_block bb;
-  tree rhs;
-  tree arg;
+  tree rhs, res, arg;
 
   gcc_assert (gimple_code (phi) == GIMPLE_PHI
 	      && gimple_phi_num_args (phi) == 2);
 
+  res = gimple_phi_result (phi);
+  /* Do not handle virtual phi nodes.  */
+  if (!is_gimple_reg (SSA_NAME_VAR (res)))
+    return;
+
   bb = gimple_bb (phi);
 
   arg = degenerate_phi_result (phi);
@@ -1026,11 +1053,11 @@  replace_phi_with_cond_gimple_assign_stmt (gimple phi, tree cond,
 	}
 
       /* Build new RHS using selected condition and arguments.  */
-      rhs = build3 (COND_EXPR, TREE_TYPE (PHI_RESULT (phi)),
+      rhs = build3 (COND_EXPR, TREE_TYPE (res),
 		    unshare_expr (cond), arg_0, arg_1);
     }
 
-  new_stmt = gimple_build_assign (PHI_RESULT (phi), rhs);
+  new_stmt = gimple_build_assign (res, rhs);
   SSA_NAME_DEF_STMT (gimple_phi_result (phi)) = new_stmt;
   gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
   update_stmt (new_stmt);
@@ -1042,11 +1069,11 @@  replace_phi_with_cond_gimple_assign_stmt (gimple phi, tree cond,
     }
 }
 
-/* Replaces in LOOP all the phi nodes other than those in the
+/* Replaces in LOOP all the scalar phi nodes other than those in the
    LOOP->header block with conditional modify expressions.  */
 
 static void
-ifconvert_phi_nodes (struct loop *loop)
+predicate_all_scalar_phis (struct loop *loop)
 {
   basic_block bb;
   unsigned int orig_loop_num_nodes = loop->num_nodes;
@@ -1075,7 +1102,7 @@  ifconvert_phi_nodes (struct loop *loop)
       while (!gsi_end_p (phi_gsi))
 	{
 	  phi = gsi_stmt (phi_gsi);
-	  replace_phi_with_cond_gimple_assign_stmt (phi, cond, true_bb, &gsi);
+	  predicate_scalar_phi (phi, cond, true_bb, &gsi);
 	  release_phi_node (phi);
 	  gsi_next (&phi_gsi);
 	}
@@ -1095,7 +1122,7 @@  insert_gimplified_predicates (loop_p loop)
   for (i = 0; i < loop->num_nodes; i++)
     {
       basic_block bb = ifc_bbs[i];
-      gimple_seq stmts = bb_predicate_gimplified_stmts (bb);
+      gimple_seq stmts;
 
       if (!is_predicated (bb))
 	{
@@ -1106,15 +1133,30 @@  insert_gimplified_predicates (loop_p loop)
 	  continue;
 	}
 
+      stmts = bb_predicate_gimplified_stmts (bb);
       if (stmts)
 	{
-	  gimple_stmt_iterator gsi = gsi_last_bb (bb);
-
-	  if (gsi_end_p (gsi)
-	      || gimple_code (gsi_stmt (gsi)) == GIMPLE_COND)
-	    gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
+	  if (flag_tree_loop_if_convert_memory_writes)
+	    {
+	      /* Insert the predicate of the BB just after the label,
+		 as the if-conversion of memory writes will use this
+		 predicate.  */
+	      gimple_stmt_iterator gsi = gsi_after_labels (bb);
+	      gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
+	    }
 	  else
-	    gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
+	    {
+	      /* Insert the predicate of the BB at the end of the BB
+		 as this would reduce the register pressure: the only
+		 use of this predicate will be in successor BBs.  */
+	      gimple_stmt_iterator gsi = gsi_last_bb (bb);
+
+	      if (gsi_end_p (gsi)
+		  || gimple_code (gsi_stmt (gsi)) == GIMPLE_COND)
+		gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
+	      else
+		gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
+	    }
 
 	  /* Once the sequence is code generated, set it to NULL.  */
 	  set_bb_predicate_gimplified_stmts (bb, NULL);
@@ -1122,6 +1164,56 @@  insert_gimplified_predicates (loop_p loop)
     }
 }
 
+/* Predicate each write to memory in LOOP.
+
+   Replace a statement "A[i] = foo" with "A[i] = cond ? foo : A[i]"
+   with the condition COND determined from the predicate of the basic
+   block containing the statement.  */
+
+static void
+predicate_mem_writes (loop_p loop)
+{
+  unsigned int i, orig_loop_num_nodes = loop->num_nodes;
+
+  for (i = 1; i < orig_loop_num_nodes; i++)
+    {
+      gimple_stmt_iterator gsi;
+      basic_block bb = ifc_bbs[i];
+      tree cond = bb_predicate (bb);
+
+      if (is_true_predicate (cond))
+	continue;
+
+      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+	if (is_gimple_assign (gsi_stmt (gsi))
+	    && gimple_vdef (gsi_stmt (gsi)))
+	  {
+	    gimple new_stmt, lhs_stmt, rhs_stmt;
+	    gimple stmt = gsi_stmt (gsi);
+	    tree lhs = gimple_get_lhs (stmt);
+	    tree rhs = gimple_op (stmt, 1);
+
+	    gcc_assert (gimple_num_ops (stmt) == 2);
+
+	    rhs_stmt = ifc_temp_var (TREE_TYPE (rhs), unshare_expr (rhs));
+	    gsi_insert_before (&gsi, rhs_stmt, GSI_SAME_STMT);
+	    rhs = gimple_get_lhs (rhs_stmt);
+
+	    lhs_stmt = ifc_temp_var (TREE_TYPE (lhs), unshare_expr (lhs));
+	    gsi_insert_before (&gsi, lhs_stmt, GSI_SAME_STMT);
+	    lhs = gimple_get_lhs (lhs_stmt);
+
+	    cond = unshare_expr (cond);
+	    rhs = build3 (COND_EXPR, TREE_TYPE (lhs), cond, rhs, lhs);
+
+	    new_stmt = ifc_temp_var (TREE_TYPE (lhs), rhs);
+	    gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
+	    gimple_set_op (stmt, 1, gimple_get_lhs (new_stmt));
+	    update_stmt (stmt);
+	  }
+    }
+}
+
 /* Remove all GIMPLE_CONDs and GIMPLE_LABELs of all the basic blocks
    other than the exit and latch of the LOOP.  Also resets the
    GIMPLE_DEBUG information.  */
@@ -1178,7 +1270,10 @@  combine_blocks (struct loop *loop)
 
   remove_conditions_and_labels (loop);
   insert_gimplified_predicates (loop);
-  ifconvert_phi_nodes (loop);
+  predicate_all_scalar_phis (loop);
+
+  if (flag_tree_loop_if_convert_memory_writes)
+    predicate_mem_writes (loop);
 
   /* Merge basic blocks: first remove all the edges in the loop,
      except for those from the exit block.  */
@@ -1280,6 +1375,10 @@  tree_if_conversion (struct loop *loop)
      blocks into one huge basic block doing the if-conversion
      on-the-fly.  */
   combine_blocks (loop);
+
+  if (flag_tree_loop_if_convert_memory_writes)
+    mark_sym_for_renaming (gimple_vop (cfun));
+
   changed = true;
 
  cleanup:
@@ -1312,6 +1411,9 @@  main_tree_if_conversion (void)
   FOR_EACH_LOOP (li, loop, 0)
     changed |= tree_if_conversion (loop);
 
+  if (changed && flag_tree_loop_if_convert_memory_writes)
+    update_ssa (TODO_update_ssa);
+
   return changed ? TODO_cleanup_cfg : 0;
 }
 
@@ -1321,7 +1423,8 @@  static bool
 gate_tree_if_conversion (void)
 {
   return ((flag_tree_vectorize && flag_tree_loop_if_convert != 0)
-	  || flag_tree_loop_if_convert == 1);
+	  || flag_tree_loop_if_convert == 1
+	  || flag_tree_loop_if_convert_memory_writes == 1);
 }
 
 struct gimple_opt_pass pass_if_conversion =