Patchwork Hoist adjacent pointer loads

login
register
mail settings
Submitter William J. Schmidt
Date May 22, 2012, 4:36 p.m.
Message ID <1337704565.21356.6.camel@gnopaine>
Download mbox | patch
Permalink /patch/160688/
State New
Headers show

Comments

William J. Schmidt - May 22, 2012, 4:36 p.m.
Here's a revision of the hoist-adjacent-loads patch.  Besides hopefully
addressing all your comments, I added a gate of at least -O2 for this
transformation.  Let me know if you prefer a different minimum opt
level.

I'm still running SPEC tests to make sure there are no regressions when
opening this up to non-pointer arguments.  The code bootstraps on
powerpc64-unknown-linux-gnu with no regressions.  Assuming the SPEC
numbers come out as expected, is this ok?

Thanks,
Bill


2012-05-22  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>

	* tree-ssa-phiopt.c (tree_ssa_phiopt_worker): Add argument to forward
	declaration.
	(hoist_adjacent_loads, gate_hoist_loads): New forward declarations.
	(tree_ssa_phiopt): Call gate_hoist_loads.
	(tree_ssa_cs_elim): Add parm to tree_ssa_phiopt_worker call.
	(tree_ssa_phiopt_worker): Add do_hoist_loads to formal arg list; call
	hoist_adjacent_loads.
	(local_mem_dependence): New function.
	(hoist_adjacent_loads): Likewise.
	(gate_hoist_loads): Likewise.
	* common.opt (fhoist-adjacent-loads): New switch.
	* Makefile.in (tree-ssa-phiopt.o): Added dependencies.
	* params.def (PARAM_MIN_CMOVE_STRUCT_ALIGN): New param.
Richard Guenther - May 23, 2012, 11:25 a.m.
On Tue, 22 May 2012, William J. Schmidt wrote:

> Here's a revision of the hoist-adjacent-loads patch.  Besides hopefully
> addressing all your comments, I added a gate of at least -O2 for this
> transformation.  Let me know if you prefer a different minimum opt
> level.
> 
> I'm still running SPEC tests to make sure there are no regressions when
> opening this up to non-pointer arguments.  The code bootstraps on
> powerpc64-unknown-linux-gnu with no regressions.  Assuming the SPEC
> numbers come out as expected, is this ok?
> 
> Thanks,
> Bill
> 
> 
> 2012-05-22  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>
> 
> 	* tree-ssa-phiopt.c (tree_ssa_phiopt_worker): Add argument to forward
> 	declaration.
> 	(hoist_adjacent_loads, gate_hoist_loads): New forward declarations.
> 	(tree_ssa_phiopt): Call gate_hoist_loads.
> 	(tree_ssa_cs_elim): Add parm to tree_ssa_phiopt_worker call.
> 	(tree_ssa_phiopt_worker): Add do_hoist_loads to formal arg list; call
> 	hoist_adjacent_loads.
> 	(local_mem_dependence): New function.
> 	(hoist_adjacent_loads): Likewise.
> 	(gate_hoist_loads): Likewise.
> 	* common.opt (fhoist-adjacent-loads): New switch.
> 	* Makefile.in (tree-ssa-phiopt.o): Added dependencies.
> 	* params.def (PARAM_MIN_CMOVE_STRUCT_ALIGN): New param.
> 
> 
> Index: gcc/tree-ssa-phiopt.c
> ===================================================================
> --- gcc/tree-ssa-phiopt.c	(revision 187728)
> +++ gcc/tree-ssa-phiopt.c	(working copy)
> @@ -37,9 +37,17 @@ along with GCC; see the file COPYING3.  If not see
>  #include "cfgloop.h"
>  #include "tree-data-ref.h"
>  #include "tree-pretty-print.h"
> +#include "gimple-pretty-print.h"
> +#include "insn-config.h"
> +#include "expr.h"
> +#include "optabs.h"
>  
> +#ifndef HAVE_conditional_move
> +#define HAVE_conditional_move (0)
> +#endif
> +
>  static unsigned int tree_ssa_phiopt (void);
> -static unsigned int tree_ssa_phiopt_worker (bool);
> +static unsigned int tree_ssa_phiopt_worker (bool, bool);
>  static bool conditional_replacement (basic_block, basic_block,
>  				     edge, edge, gimple, tree, tree);
>  static int value_replacement (basic_block, basic_block,
> @@ -53,6 +61,9 @@ static bool cond_store_replacement (basic_block, b
>  static bool cond_if_else_store_replacement (basic_block, basic_block, basic_block);
>  static struct pointer_set_t * get_non_trapping (void);
>  static void replace_phi_edge_with_variable (basic_block, edge, gimple, tree);
> +static void hoist_adjacent_loads (basic_block, basic_block,
> +				  basic_block, basic_block);
> +static bool gate_hoist_loads (void);
>  
>  /* This pass tries to replaces an if-then-else block with an
>     assignment.  We have four kinds of transformations.  Some of these
> @@ -138,12 +149,56 @@ static void replace_phi_edge_with_variable (basic_
>       bb2:
>         x = PHI <x' (bb0), ...>;
>  
> -   A similar transformation is done for MAX_EXPR.  */
> +   A similar transformation is done for MAX_EXPR.
>  
> +
> +   This pass also performs a fifth transformation of a slightly different
> +   flavor.
> +
> +   Adjacent Load Hoisting
> +   ----------------------
> +   
> +   This transformation replaces
> +
> +     bb0:
> +       if (...) goto bb2; else goto bb1;
> +     bb1:
> +       x1 = (<expr>).field1;
> +       goto bb3;
> +     bb2:
> +       x2 = (<expr>).field2;
> +     bb3:
> +       # x = PHI <x1, x2>;
> +
> +   with
> +
> +     bb0:
> +       x1 = (<expr>).field1;
> +       x2 = (<expr>).field2;
> +       if (...) goto bb2; else goto bb1;
> +     bb1:
> +       goto bb3;
> +     bb2:
> +     bb3:
> +       # x = PHI <x1, x2>;
> +
> +   The purpose of this transformation is to enable generation of conditional
> +   move instructions such as Intel CMOVE or PowerPC ISEL.  Because one of
> +   the loads is speculative, the transformation is restricted to very
> +   specific cases to avoid introducing a page fault.  We are looking for
> +   the common idiom:
> +
> +     if (...)
> +       x = y->left;
> +     else
> +       x = y->right;
> +
> +   where left and right are typically adjacent pointers in a tree structure.  */
> +
>  static unsigned int
>  tree_ssa_phiopt (void)
>  {
> -  return tree_ssa_phiopt_worker (false);
> +  return tree_ssa_phiopt_worker (false, gate_hoist_loads ());
>  }
>  
>  /* This pass tries to transform conditional stores into unconditional
> @@ -190,7 +245,7 @@ tree_ssa_phiopt (void)
>  static unsigned int
>  tree_ssa_cs_elim (void)
>  {
> -  return tree_ssa_phiopt_worker (true);
> +  return tree_ssa_phiopt_worker (true, false);
>  }
>  
>  /* Return the singleton PHI in the SEQ of PHIs for edges E0 and E1. */
> @@ -227,9 +282,11 @@ static tree condstoretemp;
>  /* The core routine of conditional store replacement and normal
>     phi optimizations.  Both share much of the infrastructure in how
>     to match applicable basic block patterns.  DO_STORE_ELIM is true
> -   when we want to do conditional store replacement, false otherwise.  */
> +   when we want to do conditional store replacement, false otherwise.
> +   DO_HOIST_LOADS is true when we want to hoist adjacent loads out 
> +   of diamond control flow patterns, false otherwise.  */
>  static unsigned int
> -tree_ssa_phiopt_worker (bool do_store_elim)
> +tree_ssa_phiopt_worker (bool do_store_elim, bool do_hoist_loads)
>  {
>    basic_block bb;
>    basic_block *bb_order;
> @@ -312,6 +369,20 @@ static unsigned int
>  	    cfgchanged = true;
>  	  continue;
>  	}
> +      else if (do_hoist_loads
> +		 && EDGE_SUCC (bb1, 0)->dest == EDGE_SUCC (bb2, 0)->dest)
> +	{
> +	  basic_block bb3 = EDGE_SUCC (bb1, 0)->dest;
> +
> +	  if (single_succ_p (bb1)
> +	      && single_succ_p (bb2)
> +	      && single_pred_p (bb1)
> +	      && single_pred_p (bb2)
> +	      && EDGE_COUNT (bb->succs) == 2
> +	      && EDGE_COUNT (bb3->preds) == 2)
> +	    hoist_adjacent_loads (bb, bb1, bb2, bb3);
> +	  continue;
> +	}
>        else
>  	continue;      
>  
> @@ -1707,6 +1778,206 @@ cond_if_else_store_replacement (basic_block then_b
>    return ok;
>  }
>  
> +/* Return TRUE if STMT has a VUSE whose corresponding VDEF is in BB.  */
> +
> +static bool
> +local_mem_dependence (gimple stmt, basic_block bb)
> +{
> +  tree vuse = gimple_vuse (stmt);
> +  gimple def;
> +
> +  if (!vuse)
> +    return false;
> +
> +  def = SSA_NAME_DEF_STMT (vuse);
> +  return (def && gimple_bb (def) == bb);
> +}
> +
> +/* Given a "diamond" control-flow pattern where BB0 tests a condition,
> +   BB1 and BB2 are "then" and "else" blocks dependent on this test,
> +   and BB3 rejoins control flow following BB1 and BB2, look for 
> +   opportunities to hoist loads as follows.  If BB3 contains a PHI of
> +   two loads, one each occurring in BB1 and BB2, and the loads are
> +   provably of adjacent fields in the same structure, then move both
> +   loads into BB0.  Of course this can only be done if there are no
> +   dependencies preventing such motion.
> +
> +   One of the hoisted loads will always be speculative, so the
> +   transformation is currently conservative:
> +
> +    - The fields must be strictly adjacent.
> +    - The two fields must occupy a single memory block that is
> +      guaranteed to not cross a page boundary.
> +
> +    The last is difficult to prove, as such memory blocks should be
> +    aligned on the minimum of the stack alignment boundary and the
> +    alignment guaranteed by heap allocation interfaces.  Thus we rely
> +    on a parameter for the alignment value.
> +
> +    Provided a good value is used for the last case, the first
> +    restriction could possibly be relaxed.  */
> +
> +static void
> +hoist_adjacent_loads (basic_block bb0, basic_block bb1,
> +		      basic_block bb2, basic_block bb3)
> +{
> +  int param_align = PARAM_VALUE (PARAM_MIN_CMOVE_STRUCT_ALIGN);
> +  gimple_stmt_iterator gsi;
> +
> +  /* Walk the phis in bb3 looking for an opportunity.  We are looking
> +     for phis of two SSA names, one each of which is defined in bb1 and
> +     bb2.  */
> +  for (gsi = gsi_start_phis (bb3); !gsi_end_p (gsi); gsi_next (&gsi))
> +    {
> +      gimple phi_stmt = gsi_stmt (gsi);
> +      gimple def1, def2, defswap;
> +      tree arg1, arg2, ref1, ref2, field1, field2, fieldswap;
> +      tree tree_offset1, tree_offset2, tree_size2, next;
> +      int offset1, offset2, size2, block_offset;
> +      unsigned align1;
> +      gimple_stmt_iterator gsi2;
> +
> +      if (gimple_phi_num_args (phi_stmt) != 2)
> +	continue;
> +
> +      arg1 = gimple_phi_arg_def (phi_stmt, 0);
> +      arg2 = gimple_phi_arg_def (phi_stmt, 1);
> +      
> +      if (TREE_CODE (arg1) != SSA_NAME
> +	  || TREE_CODE (arg2) != SSA_NAME
> +	  || SSA_NAME_IS_DEFAULT_DEF (arg1)
> +	  || SSA_NAME_IS_DEFAULT_DEF (arg2))
> +	continue;
> +
> +      def1 = SSA_NAME_DEF_STMT (arg1);
> +      def2 = SSA_NAME_DEF_STMT (arg2);
> +
> +      if ((gimple_bb (def1) != bb1 || gimple_bb (def2) != bb2)
> +	  && (gimple_bb (def2) != bb1 || gimple_bb (def1) != bb2))
> +	continue;
> +
> +      /* Unless -fhoist-adjacent-loads was specified, check the mode
> +	 of the arguments to be sure a conditional move can be generated
> +	 for it.  */
> +      if (flag_hoist_adjacent_loads != 1
> +	  && !optab_handler (cmov_optab, TYPE_MODE (TREE_TYPE (arg1))))
> +	continue;
> +
> +      /* Both statements must be assignments whose RHS is a COMPONENT_REF.  */
> +      if (!gimple_assign_single_p (def1)
> +	  || !gimple_assign_single_p (def2))
> +	continue;
> +
> +      ref1 = gimple_assign_rhs1 (def1);
> +      ref2 = gimple_assign_rhs1 (def2);
> +
> +      if (TREE_CODE (ref1) != COMPONENT_REF
> +	  || TREE_CODE (ref2) != COMPONENT_REF)
> +	continue;
> +
> +      /* The zeroth operand of the two component references must be
> +	 identical.  It is not sufficient to compare get_base_address of
> +	 the two references, because this could allow for different
> +	 elements of the same array in the two trees.  It is not safe to
> +	 assume that the existence of one array element implies the
> +	 existence of a different one.  */
> +      if (!operand_equal_p (TREE_OPERAND (ref1, 0), TREE_OPERAND (ref2, 0), 0))
> +	continue;
> +
> +      field1 = TREE_OPERAND (ref1, 1);
> +      field2 = TREE_OPERAND (ref2, 1);
> +
> +      /* Check for field adjacency, and ensure field1 comes first.  */
> +      for (next = DECL_CHAIN (field1);
> +	   next && TREE_CODE (next) != FIELD_DECL;
> +	   next = DECL_CHAIN (next))
> +	;
> +
> +      if (!next || !operand_equal_p (next, field2, 0))

You can compare FIELD_DECLS by pointer equality, thus next != field2 here.

> +	{
> +	  for (next = DECL_CHAIN (field2);
> +	       next && TREE_CODE (next) != FIELD_DECL;
> +	       next = DECL_CHAIN (next))
> +	    ;
> +
> +	  if (!next || !operand_equal_p (next, field1, 0))

Likewise.

> +	    continue;
> +
> +	  fieldswap = field1;
> +	  field1 = field2;
> +	  field2 = fieldswap;
> +	  defswap = def1;
> +	  def1 = def2;
> +	  def2 = defswap;
> +	}
> +
> +      /* Check for proper alignment of the first field.  */
> +      tree_offset1 = bit_position (field1);
> +      tree_offset2 = bit_position (field2);
> +      tree_size2 = DECL_SIZE_UNIT (field2);
> +
> +      if (TREE_CODE (tree_size2) != INTEGER_CST

host_integerp already checks for the TREE_CODE.

> +	  || !host_integerp (tree_offset1, 0)
> +	  || !host_integerp (tree_offset2, 0)
> +	  || !host_integerp (tree_size2, 0))
> +	continue;

And I think you want , 1 in all cases, positive offsets/sizes.

> +      offset1 = TREE_INT_CST_LOW (tree_offset1);
> +      offset2 = TREE_INT_CST_LOW (tree_offset2);
> +      size2 = TREE_INT_CST_LOW (tree_size2);
> +      align1 = DECL_ALIGN (field1);
> +
> +      if (offset1 % BITS_PER_UNIT != 0 || offset1 % align1 != 0)
> +	continue;

Why offset1 % align1 != 0?  offset1 is relative to the containing
record start while DECL_ALIGN is relative to the ultimate containing
object.  Either it's redundant to check that or overly conservative.
I think what you want to check is that align1 is properly large
so you can guarantee that both fields are inside a single cache
line, thus, (offset2 - offset1 + size2) <= align1

> +      offset1 /= BITS_PER_UNIT;
> +      offset2 /= BITS_PER_UNIT;
> +
> +      /* The two field references must fit within a block of memory that
> +	 is guaranteed to be on the same page.  */
> +      block_offset = offset1 % param_align;

See above - offset1 has no interesting meaning.

> +
> +      if (block_offset + offset2 - offset1 + size2 > param_align)
> +	continue;
> +      /* The two expressions cannot be dependent upon vdefs defined
> +	 in bb1/bb2.  */
> +      if (local_mem_dependence (def1, bb1) || local_mem_dependence (def2, bb2))
> +	continue;
> +
> +      /* The conditions are satisfied; hoist the loads from bb1 and bb2 into
> +	 bb0.  We hoist the first one first so that a cache miss is handled
> +         efficiently regardless of hardware cache-fill policy.  */
> +      gsi2 = gsi_for_stmt (def1);
> +      gsi_move_to_bb_end (&gsi2, bb0);
> +      gsi2 = gsi_for_stmt (def2);
> +      gsi_move_to_bb_end (&gsi2, bb0);
> +
> +      if (dump_file && (dump_flags & TDF_DETAILS))
> +	{
> +	  fprintf (dump_file,
> +		   "\nHoisting adjacent loads from %d and %d into %d: \n",
> +		   bb1->index, bb2->index, bb0->index);
> +	  print_gimple_stmt (dump_file, def1, 0, TDF_VOPS|TDF_MEMSYMS);
> +	  print_gimple_stmt (dump_file, def2, 0, TDF_VOPS|TDF_MEMSYMS);
> +	}
> +    }
> +}
> +
> +/* Determine whether we should attempt to hoist adjacent loads out of
> +   diamond patterns in pass_phiopt.  Always hoist loads if
> +   -fhoist-adjacent-loads is specified, or if the target machine has
> +   a conditional move instruction and -fno-hoist-adjacent-loads is
> +   not specified.  */
> +
> +static bool
> +gate_hoist_loads (void)
> +{
> +  return ((optimize > 1)
> +	  && (flag_hoist_adjacent_loads == 1
> +	      || (HAVE_conditional_move && flag_hoist_adjacent_loads != 0)));

Rather than check optimize > 1 here properly initialize it with 1 and
put it into the default_options_table.

Thanks,
Richard.

> +}
> +
>  /* Always do these optimizations if we have SSA
>     trees to work on.  */
>  static bool
> Index: gcc/common.opt
> ===================================================================
> --- gcc/common.opt	(revision 187728)
> +++ gcc/common.opt	(working copy)
> @@ -1186,6 +1186,11 @@ fgraphite-identity
>  Common Report Var(flag_graphite_identity) Optimization
>  Enable Graphite Identity transformation
>  
> +fhoist-adjacent-loads
> +Common Report Var(flag_hoist_adjacent_loads) Init(-1) Optimization
> +Enable hoisting adjacent loads to encourage generating conditional move
> +instructions
> +
>  floop-parallelize-all
>  Common Report Var(flag_loop_parallelize_all) Optimization
>  Mark all loops as parallel
> Index: gcc/Makefile.in
> ===================================================================
> --- gcc/Makefile.in	(revision 187728)
> +++ gcc/Makefile.in	(working copy)
> @@ -2355,7 +2355,8 @@ tree-ssa-phiopt.o : tree-ssa-phiopt.c $(CONFIG_H)
>     $(TM_H) $(GGC_H) $(TREE_H) $(TM_P_H) $(BASIC_BLOCK_H) \
>     $(TREE_FLOW_H) $(TREE_PASS_H) $(TREE_DUMP_H) langhooks.h $(FLAGS_H) \
>     $(DIAGNOSTIC_H) $(TIMEVAR_H) pointer-set.h domwalk.h $(CFGLOOP_H) \
> -   $(TREE_DATA_REF_H) tree-pretty-print.h
> +   $(TREE_DATA_REF_H) tree-pretty-print.h gimple-pretty-print.h \
> +   insn-config.h $(EXPR_H) $(OPTABS_H)
>  tree-nrv.o : tree-nrv.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
>     $(TM_H) $(TREE_H) $(FUNCTION_H) $(BASIC_BLOCK_H) $(FLAGS_H) \
>     $(DIAGNOSTIC_H) $(TREE_FLOW_H) $(TIMEVAR_H) $(TREE_DUMP_H) $(TREE_PASS_H) \
> Index: gcc/params.def
> ===================================================================
> --- gcc/params.def	(revision 187728)
> +++ gcc/params.def	(working copy)
> @@ -979,6 +979,13 @@ DEFPARAM (PARAM_MAX_TRACKED_STRLENS,
>  	  "track string lengths",
>  	  1000, 0, 0)
>  
> +/* For adjacent load hoisting transformation in tree-ssa-phiopt.c.  */
> +DEFPARAM (PARAM_MIN_CMOVE_STRUCT_ALIGN,
> +	  "min-cmove-struct-align",
> +	  "Minimum byte alignment to assume for structures in the stack "
> +	  "or heap when considering load hoisting for conditional moves",
> +	  16, 8, 256)
> +
>  /*
>  Local variables:
>  mode:c
> 
> 
>
William J. Schmidt - May 24, 2012, 4:12 a.m.
On Wed, 2012-05-23 at 13:25 +0200, Richard Guenther wrote:
> On Tue, 22 May 2012, William J. Schmidt wrote:
> 
> > Here's a revision of the hoist-adjacent-loads patch.  Besides hopefully
> > addressing all your comments, I added a gate of at least -O2 for this
> > transformation.  Let me know if you prefer a different minimum opt
> > level.
> > 
> > I'm still running SPEC tests to make sure there are no regressions when
> > opening this up to non-pointer arguments.  The code bootstraps on
> > powerpc64-unknown-linux-gnu with no regressions.  Assuming the SPEC
> > numbers come out as expected, is this ok?
> > 
> > Thanks,
> > Bill
> > 
> > 
> > 2012-05-22  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>
> > 
> > 	* tree-ssa-phiopt.c (tree_ssa_phiopt_worker): Add argument to forward
> > 	declaration.
> > 	(hoist_adjacent_loads, gate_hoist_loads): New forward declarations.
> > 	(tree_ssa_phiopt): Call gate_hoist_loads.
> > 	(tree_ssa_cs_elim): Add parm to tree_ssa_phiopt_worker call.
> > 	(tree_ssa_phiopt_worker): Add do_hoist_loads to formal arg list; call
> > 	hoist_adjacent_loads.
> > 	(local_mem_dependence): New function.
> > 	(hoist_adjacent_loads): Likewise.
> > 	(gate_hoist_loads): Likewise.
> > 	* common.opt (fhoist-adjacent-loads): New switch.
> > 	* Makefile.in (tree-ssa-phiopt.o): Added dependencies.
> > 	* params.def (PARAM_MIN_CMOVE_STRUCT_ALIGN): New param.
> > 
> > 
> > Index: gcc/tree-ssa-phiopt.c
> > ===================================================================
> > --- gcc/tree-ssa-phiopt.c	(revision 187728)
> > +++ gcc/tree-ssa-phiopt.c	(working copy)
> > @@ -37,9 +37,17 @@ along with GCC; see the file COPYING3.  If not see
> >  #include "cfgloop.h"
> >  #include "tree-data-ref.h"
> >  #include "tree-pretty-print.h"
> > +#include "gimple-pretty-print.h"
> > +#include "insn-config.h"
> > +#include "expr.h"
> > +#include "optabs.h"
> >  
> > +#ifndef HAVE_conditional_move
> > +#define HAVE_conditional_move (0)
> > +#endif
> > +
> >  static unsigned int tree_ssa_phiopt (void);
> > -static unsigned int tree_ssa_phiopt_worker (bool);
> > +static unsigned int tree_ssa_phiopt_worker (bool, bool);
> >  static bool conditional_replacement (basic_block, basic_block,
> >  				     edge, edge, gimple, tree, tree);
> >  static int value_replacement (basic_block, basic_block,
> > @@ -53,6 +61,9 @@ static bool cond_store_replacement (basic_block, b
> >  static bool cond_if_else_store_replacement (basic_block, basic_block, basic_block);
> >  static struct pointer_set_t * get_non_trapping (void);
> >  static void replace_phi_edge_with_variable (basic_block, edge, gimple, tree);
> > +static void hoist_adjacent_loads (basic_block, basic_block,
> > +				  basic_block, basic_block);
> > +static bool gate_hoist_loads (void);
> >  
> >  /* This pass tries to replaces an if-then-else block with an
> >     assignment.  We have four kinds of transformations.  Some of these
> > @@ -138,12 +149,56 @@ static void replace_phi_edge_with_variable (basic_
> >       bb2:
> >         x = PHI <x' (bb0), ...>;
> >  
> > -   A similar transformation is done for MAX_EXPR.  */
> > +   A similar transformation is done for MAX_EXPR.
> >  
> > +
> > +   This pass also performs a fifth transformation of a slightly different
> > +   flavor.
> > +
> > +   Adjacent Load Hoisting
> > +   ----------------------
> > +   
> > +   This transformation replaces
> > +
> > +     bb0:
> > +       if (...) goto bb2; else goto bb1;
> > +     bb1:
> > +       x1 = (<expr>).field1;
> > +       goto bb3;
> > +     bb2:
> > +       x2 = (<expr>).field2;
> > +     bb3:
> > +       # x = PHI <x1, x2>;
> > +
> > +   with
> > +
> > +     bb0:
> > +       x1 = (<expr>).field1;
> > +       x2 = (<expr>).field2;
> > +       if (...) goto bb2; else goto bb1;
> > +     bb1:
> > +       goto bb3;
> > +     bb2:
> > +     bb3:
> > +       # x = PHI <x1, x2>;
> > +
> > +   The purpose of this transformation is to enable generation of conditional
> > +   move instructions such as Intel CMOVE or PowerPC ISEL.  Because one of
> > +   the loads is speculative, the transformation is restricted to very
> > +   specific cases to avoid introducing a page fault.  We are looking for
> > +   the common idiom:
> > +
> > +     if (...)
> > +       x = y->left;
> > +     else
> > +       x = y->right;
> > +
> > +   where left and right are typically adjacent pointers in a tree structure.  */
> > +
> >  static unsigned int
> >  tree_ssa_phiopt (void)
> >  {
> > -  return tree_ssa_phiopt_worker (false);
> > +  return tree_ssa_phiopt_worker (false, gate_hoist_loads ());
> >  }
> >  
> >  /* This pass tries to transform conditional stores into unconditional
> > @@ -190,7 +245,7 @@ tree_ssa_phiopt (void)
> >  static unsigned int
> >  tree_ssa_cs_elim (void)
> >  {
> > -  return tree_ssa_phiopt_worker (true);
> > +  return tree_ssa_phiopt_worker (true, false);
> >  }
> >  
> >  /* Return the singleton PHI in the SEQ of PHIs for edges E0 and E1. */
> > @@ -227,9 +282,11 @@ static tree condstoretemp;
> >  /* The core routine of conditional store replacement and normal
> >     phi optimizations.  Both share much of the infrastructure in how
> >     to match applicable basic block patterns.  DO_STORE_ELIM is true
> > -   when we want to do conditional store replacement, false otherwise.  */
> > +   when we want to do conditional store replacement, false otherwise.
> > +   DO_HOIST_LOADS is true when we want to hoist adjacent loads out 
> > +   of diamond control flow patterns, false otherwise.  */
> >  static unsigned int
> > -tree_ssa_phiopt_worker (bool do_store_elim)
> > +tree_ssa_phiopt_worker (bool do_store_elim, bool do_hoist_loads)
> >  {
> >    basic_block bb;
> >    basic_block *bb_order;
> > @@ -312,6 +369,20 @@ static unsigned int
> >  	    cfgchanged = true;
> >  	  continue;
> >  	}
> > +      else if (do_hoist_loads
> > +		 && EDGE_SUCC (bb1, 0)->dest == EDGE_SUCC (bb2, 0)->dest)
> > +	{
> > +	  basic_block bb3 = EDGE_SUCC (bb1, 0)->dest;
> > +
> > +	  if (single_succ_p (bb1)
> > +	      && single_succ_p (bb2)
> > +	      && single_pred_p (bb1)
> > +	      && single_pred_p (bb2)
> > +	      && EDGE_COUNT (bb->succs) == 2
> > +	      && EDGE_COUNT (bb3->preds) == 2)
> > +	    hoist_adjacent_loads (bb, bb1, bb2, bb3);
> > +	  continue;
> > +	}
> >        else
> >  	continue;      
> >  
> > @@ -1707,6 +1778,206 @@ cond_if_else_store_replacement (basic_block then_b
> >    return ok;
> >  }
> >  
> > +/* Return TRUE if STMT has a VUSE whose corresponding VDEF is in BB.  */
> > +
> > +static bool
> > +local_mem_dependence (gimple stmt, basic_block bb)
> > +{
> > +  tree vuse = gimple_vuse (stmt);
> > +  gimple def;
> > +
> > +  if (!vuse)
> > +    return false;
> > +
> > +  def = SSA_NAME_DEF_STMT (vuse);
> > +  return (def && gimple_bb (def) == bb);
> > +}
> > +
> > +/* Given a "diamond" control-flow pattern where BB0 tests a condition,
> > +   BB1 and BB2 are "then" and "else" blocks dependent on this test,
> > +   and BB3 rejoins control flow following BB1 and BB2, look for 
> > +   opportunities to hoist loads as follows.  If BB3 contains a PHI of
> > +   two loads, one each occurring in BB1 and BB2, and the loads are
> > +   provably of adjacent fields in the same structure, then move both
> > +   loads into BB0.  Of course this can only be done if there are no
> > +   dependencies preventing such motion.
> > +
> > +   One of the hoisted loads will always be speculative, so the
> > +   transformation is currently conservative:
> > +
> > +    - The fields must be strictly adjacent.
> > +    - The two fields must occupy a single memory block that is
> > +      guaranteed to not cross a page boundary.
> > +
> > +    The last is difficult to prove, as such memory blocks should be
> > +    aligned on the minimum of the stack alignment boundary and the
> > +    alignment guaranteed by heap allocation interfaces.  Thus we rely
> > +    on a parameter for the alignment value.
> > +
> > +    Provided a good value is used for the last case, the first
> > +    restriction could possibly be relaxed.  */
> > +
> > +static void
> > +hoist_adjacent_loads (basic_block bb0, basic_block bb1,
> > +		      basic_block bb2, basic_block bb3)
> > +{
> > +  int param_align = PARAM_VALUE (PARAM_MIN_CMOVE_STRUCT_ALIGN);
> > +  gimple_stmt_iterator gsi;
> > +
> > +  /* Walk the phis in bb3 looking for an opportunity.  We are looking
> > +     for phis of two SSA names, one each of which is defined in bb1 and
> > +     bb2.  */
> > +  for (gsi = gsi_start_phis (bb3); !gsi_end_p (gsi); gsi_next (&gsi))
> > +    {
> > +      gimple phi_stmt = gsi_stmt (gsi);
> > +      gimple def1, def2, defswap;
> > +      tree arg1, arg2, ref1, ref2, field1, field2, fieldswap;
> > +      tree tree_offset1, tree_offset2, tree_size2, next;
> > +      int offset1, offset2, size2, block_offset;
> > +      unsigned align1;
> > +      gimple_stmt_iterator gsi2;
> > +
> > +      if (gimple_phi_num_args (phi_stmt) != 2)
> > +	continue;
> > +
> > +      arg1 = gimple_phi_arg_def (phi_stmt, 0);
> > +      arg2 = gimple_phi_arg_def (phi_stmt, 1);
> > +      
> > +      if (TREE_CODE (arg1) != SSA_NAME
> > +	  || TREE_CODE (arg2) != SSA_NAME
> > +	  || SSA_NAME_IS_DEFAULT_DEF (arg1)
> > +	  || SSA_NAME_IS_DEFAULT_DEF (arg2))
> > +	continue;
> > +
> > +      def1 = SSA_NAME_DEF_STMT (arg1);
> > +      def2 = SSA_NAME_DEF_STMT (arg2);
> > +
> > +      if ((gimple_bb (def1) != bb1 || gimple_bb (def2) != bb2)
> > +	  && (gimple_bb (def2) != bb1 || gimple_bb (def1) != bb2))
> > +	continue;
> > +
> > +      /* Unless -fhoist-adjacent-loads was specified, check the mode
> > +	 of the arguments to be sure a conditional move can be generated
> > +	 for it.  */
> > +      if (flag_hoist_adjacent_loads != 1
> > +	  && !optab_handler (cmov_optab, TYPE_MODE (TREE_TYPE (arg1))))
> > +	continue;
> > +
> > +      /* Both statements must be assignments whose RHS is a COMPONENT_REF.  */
> > +      if (!gimple_assign_single_p (def1)
> > +	  || !gimple_assign_single_p (def2))
> > +	continue;
> > +
> > +      ref1 = gimple_assign_rhs1 (def1);
> > +      ref2 = gimple_assign_rhs1 (def2);
> > +
> > +      if (TREE_CODE (ref1) != COMPONENT_REF
> > +	  || TREE_CODE (ref2) != COMPONENT_REF)
> > +	continue;
> > +
> > +      /* The zeroth operand of the two component references must be
> > +	 identical.  It is not sufficient to compare get_base_address of
> > +	 the two references, because this could allow for different
> > +	 elements of the same array in the two trees.  It is not safe to
> > +	 assume that the existence of one array element implies the
> > +	 existence of a different one.  */
> > +      if (!operand_equal_p (TREE_OPERAND (ref1, 0), TREE_OPERAND (ref2, 0), 0))
> > +	continue;
> > +
> > +      field1 = TREE_OPERAND (ref1, 1);
> > +      field2 = TREE_OPERAND (ref2, 1);
> > +
> > +      /* Check for field adjacency, and ensure field1 comes first.  */
> > +      for (next = DECL_CHAIN (field1);
> > +	   next && TREE_CODE (next) != FIELD_DECL;
> > +	   next = DECL_CHAIN (next))
> > +	;
> > +
> > +      if (!next || !operand_equal_p (next, field2, 0))
> 
> You can compare FIELD_DECLS by pointer equality, thus next != field2 here.
> 
> > +	{
> > +	  for (next = DECL_CHAIN (field2);
> > +	       next && TREE_CODE (next) != FIELD_DECL;
> > +	       next = DECL_CHAIN (next))
> > +	    ;
> > +
> > +	  if (!next || !operand_equal_p (next, field1, 0))
> 
> Likewise.
> 
> > +	    continue;
> > +
> > +	  fieldswap = field1;
> > +	  field1 = field2;
> > +	  field2 = fieldswap;
> > +	  defswap = def1;
> > +	  def1 = def2;
> > +	  def2 = defswap;
> > +	}
> > +
> > +      /* Check for proper alignment of the first field.  */
> > +      tree_offset1 = bit_position (field1);
> > +      tree_offset2 = bit_position (field2);
> > +      tree_size2 = DECL_SIZE_UNIT (field2);
> > +
> > +      if (TREE_CODE (tree_size2) != INTEGER_CST
> 
> host_integerp already checks for the TREE_CODE.
> 
> > +	  || !host_integerp (tree_offset1, 0)
> > +	  || !host_integerp (tree_offset2, 0)
> > +	  || !host_integerp (tree_size2, 0))
> > +	continue;
> 
> And I think you want , 1 in all cases, positive offsets/sizes.
> 
> > +      offset1 = TREE_INT_CST_LOW (tree_offset1);
> > +      offset2 = TREE_INT_CST_LOW (tree_offset2);
> > +      size2 = TREE_INT_CST_LOW (tree_size2);
> > +      align1 = DECL_ALIGN (field1);
> > +
> > +      if (offset1 % BITS_PER_UNIT != 0 || offset1 % align1 != 0)
> > +	continue;
> 
> Why offset1 % align1 != 0?  offset1 is relative to the containing
> record start while DECL_ALIGN is relative to the ultimate containing
> object.  Either it's redundant to check that or overly conservative.
> I think what you want to check is that align1 is properly large
> so you can guarantee that both fields are inside a single cache
> line, thus, (offset2 - offset1 + size2) <= align1

Hm, I think really what we want is to just look within the block size
established by param_align...something like:

      offset1 = TREE_INT_CST_LOW (tree_offset1);
      offset2 = TREE_INT_CST_LOW (tree_offset2);
      size2 = TREE_INT_CST_LOW (tree_size2);
      align1 = DECL_ALIGN (field1) % (param_align * BITS_PER_UNIT);

      if (offset1 % BITS_PER_UNIT != 0)
	continue;

      /* The two field references must fit within a block of memory that
	 is guaranteed to be on the same page.  */
      if (align1 + offset2 - offset1 + size2 > param_align * BITS_PER_UNIT)
	continue;

I.e., use the DECL_ALIGN to find where within the block we are probably
starting, then make sure the extent of the two objects doesn't run past
the end of the block.  Sound right?

> 
> > +      offset1 /= BITS_PER_UNIT;
> > +      offset2 /= BITS_PER_UNIT;
> > +
> > +      /* The two field references must fit within a block of memory that
> > +	 is guaranteed to be on the same page.  */
> > +      block_offset = offset1 % param_align;
> 
> See above - offset1 has no interesting meaning.
> 
> > +
> > +      if (block_offset + offset2 - offset1 + size2 > param_align)
> > +	continue;
> > +      /* The two expressions cannot be dependent upon vdefs defined
> > +	 in bb1/bb2.  */
> > +      if (local_mem_dependence (def1, bb1) || local_mem_dependence (def2, bb2))
> > +	continue;
> > +
> > +      /* The conditions are satisfied; hoist the loads from bb1 and bb2 into
> > +	 bb0.  We hoist the first one first so that a cache miss is handled
> > +         efficiently regardless of hardware cache-fill policy.  */
> > +      gsi2 = gsi_for_stmt (def1);
> > +      gsi_move_to_bb_end (&gsi2, bb0);
> > +      gsi2 = gsi_for_stmt (def2);
> > +      gsi_move_to_bb_end (&gsi2, bb0);
> > +
> > +      if (dump_file && (dump_flags & TDF_DETAILS))
> > +	{
> > +	  fprintf (dump_file,
> > +		   "\nHoisting adjacent loads from %d and %d into %d: \n",
> > +		   bb1->index, bb2->index, bb0->index);
> > +	  print_gimple_stmt (dump_file, def1, 0, TDF_VOPS|TDF_MEMSYMS);
> > +	  print_gimple_stmt (dump_file, def2, 0, TDF_VOPS|TDF_MEMSYMS);
> > +	}
> > +    }
> > +}
> > +
> > +/* Determine whether we should attempt to hoist adjacent loads out of
> > +   diamond patterns in pass_phiopt.  Always hoist loads if
> > +   -fhoist-adjacent-loads is specified, or if the target machine has
> > +   a conditional move instruction and -fno-hoist-adjacent-loads is
> > +   not specified.  */
> > +
> > +static bool
> > +gate_hoist_loads (void)
> > +{
> > +  return ((optimize > 1)
> > +	  && (flag_hoist_adjacent_loads == 1
> > +	      || (HAVE_conditional_move && flag_hoist_adjacent_loads != 0)));
> 
> Rather than check optimize > 1 here properly initialize it with 1 and
> put it into the default_options_table.
> 
> Thanks,
> Richard.
> 
> > +}
> > +
> >  /* Always do these optimizations if we have SSA
> >     trees to work on.  */
> >  static bool
> > Index: gcc/common.opt
> > ===================================================================
> > --- gcc/common.opt	(revision 187728)
> > +++ gcc/common.opt	(working copy)
> > @@ -1186,6 +1186,11 @@ fgraphite-identity
> >  Common Report Var(flag_graphite_identity) Optimization
> >  Enable Graphite Identity transformation
> >  
> > +fhoist-adjacent-loads
> > +Common Report Var(flag_hoist_adjacent_loads) Init(-1) Optimization
> > +Enable hoisting adjacent loads to encourage generating conditional move
> > +instructions
> > +
> >  floop-parallelize-all
> >  Common Report Var(flag_loop_parallelize_all) Optimization
> >  Mark all loops as parallel
> > Index: gcc/Makefile.in
> > ===================================================================
> > --- gcc/Makefile.in	(revision 187728)
> > +++ gcc/Makefile.in	(working copy)
> > @@ -2355,7 +2355,8 @@ tree-ssa-phiopt.o : tree-ssa-phiopt.c $(CONFIG_H)
> >     $(TM_H) $(GGC_H) $(TREE_H) $(TM_P_H) $(BASIC_BLOCK_H) \
> >     $(TREE_FLOW_H) $(TREE_PASS_H) $(TREE_DUMP_H) langhooks.h $(FLAGS_H) \
> >     $(DIAGNOSTIC_H) $(TIMEVAR_H) pointer-set.h domwalk.h $(CFGLOOP_H) \
> > -   $(TREE_DATA_REF_H) tree-pretty-print.h
> > +   $(TREE_DATA_REF_H) tree-pretty-print.h gimple-pretty-print.h \
> > +   insn-config.h $(EXPR_H) $(OPTABS_H)
> >  tree-nrv.o : tree-nrv.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
> >     $(TM_H) $(TREE_H) $(FUNCTION_H) $(BASIC_BLOCK_H) $(FLAGS_H) \
> >     $(DIAGNOSTIC_H) $(TREE_FLOW_H) $(TIMEVAR_H) $(TREE_DUMP_H) $(TREE_PASS_H) \
> > Index: gcc/params.def
> > ===================================================================
> > --- gcc/params.def	(revision 187728)
> > +++ gcc/params.def	(working copy)
> > @@ -979,6 +979,13 @@ DEFPARAM (PARAM_MAX_TRACKED_STRLENS,
> >  	  "track string lengths",
> >  	  1000, 0, 0)
> >  
> > +/* For adjacent load hoisting transformation in tree-ssa-phiopt.c.  */
> > +DEFPARAM (PARAM_MIN_CMOVE_STRUCT_ALIGN,
> > +	  "min-cmove-struct-align",
> > +	  "Minimum byte alignment to assume for structures in the stack "
> > +	  "or heap when considering load hoisting for conditional moves",
> > +	  16, 8, 256)
> > +
> >  /*
> >  Local variables:
> >  mode:c
> > 
> > 
> > 
>

Patch

Index: gcc/tree-ssa-phiopt.c
===================================================================
--- gcc/tree-ssa-phiopt.c	(revision 187728)
+++ gcc/tree-ssa-phiopt.c	(working copy)
@@ -37,9 +37,17 @@  along with GCC; see the file COPYING3.  If not see
 #include "cfgloop.h"
 #include "tree-data-ref.h"
 #include "tree-pretty-print.h"
+#include "gimple-pretty-print.h"
+#include "insn-config.h"
+#include "expr.h"
+#include "optabs.h"
 
+#ifndef HAVE_conditional_move
+#define HAVE_conditional_move (0)
+#endif
+
 static unsigned int tree_ssa_phiopt (void);
-static unsigned int tree_ssa_phiopt_worker (bool);
+static unsigned int tree_ssa_phiopt_worker (bool, bool);
 static bool conditional_replacement (basic_block, basic_block,
 				     edge, edge, gimple, tree, tree);
 static int value_replacement (basic_block, basic_block,
@@ -53,6 +61,9 @@  static bool cond_store_replacement (basic_block, b
 static bool cond_if_else_store_replacement (basic_block, basic_block, basic_block);
 static struct pointer_set_t * get_non_trapping (void);
 static void replace_phi_edge_with_variable (basic_block, edge, gimple, tree);
+static void hoist_adjacent_loads (basic_block, basic_block,
+				  basic_block, basic_block);
+static bool gate_hoist_loads (void);
 
 /* This pass tries to replaces an if-then-else block with an
    assignment.  We have four kinds of transformations.  Some of these
@@ -138,12 +149,56 @@  static void replace_phi_edge_with_variable (basic_
      bb2:
        x = PHI <x' (bb0), ...>;
 
-   A similar transformation is done for MAX_EXPR.  */
+   A similar transformation is done for MAX_EXPR.
 
+
+   This pass also performs a fifth transformation of a slightly different
+   flavor.
+
+   Adjacent Load Hoisting
+   ----------------------
+   
+   This transformation replaces
+
+     bb0:
+       if (...) goto bb2; else goto bb1;
+     bb1:
+       x1 = (<expr>).field1;
+       goto bb3;
+     bb2:
+       x2 = (<expr>).field2;
+     bb3:
+       # x = PHI <x1, x2>;
+
+   with
+
+     bb0:
+       x1 = (<expr>).field1;
+       x2 = (<expr>).field2;
+       if (...) goto bb2; else goto bb1;
+     bb1:
+       goto bb3;
+     bb2:
+     bb3:
+       # x = PHI <x1, x2>;
+
+   The purpose of this transformation is to enable generation of conditional
+   move instructions such as Intel CMOVE or PowerPC ISEL.  Because one of
+   the loads is speculative, the transformation is restricted to very
+   specific cases to avoid introducing a page fault.  We are looking for
+   the common idiom:
+
+     if (...)
+       x = y->left;
+     else
+       x = y->right;
+
+   where left and right are typically adjacent pointers in a tree structure.  */
+
 static unsigned int
 tree_ssa_phiopt (void)
 {
-  return tree_ssa_phiopt_worker (false);
+  return tree_ssa_phiopt_worker (false, gate_hoist_loads ());
 }
 
 /* This pass tries to transform conditional stores into unconditional
@@ -190,7 +245,7 @@  tree_ssa_phiopt (void)
 static unsigned int
 tree_ssa_cs_elim (void)
 {
-  return tree_ssa_phiopt_worker (true);
+  return tree_ssa_phiopt_worker (true, false);
 }
 
 /* Return the singleton PHI in the SEQ of PHIs for edges E0 and E1. */
@@ -227,9 +282,11 @@  static tree condstoretemp;
 /* The core routine of conditional store replacement and normal
    phi optimizations.  Both share much of the infrastructure in how
    to match applicable basic block patterns.  DO_STORE_ELIM is true
-   when we want to do conditional store replacement, false otherwise.  */
+   when we want to do conditional store replacement, false otherwise.
+   DO_HOIST_LOADS is true when we want to hoist adjacent loads out 
+   of diamond control flow patterns, false otherwise.  */
 static unsigned int
-tree_ssa_phiopt_worker (bool do_store_elim)
+tree_ssa_phiopt_worker (bool do_store_elim, bool do_hoist_loads)
 {
   basic_block bb;
   basic_block *bb_order;
@@ -312,6 +369,20 @@  static unsigned int
 	    cfgchanged = true;
 	  continue;
 	}
+      else if (do_hoist_loads
+		 && EDGE_SUCC (bb1, 0)->dest == EDGE_SUCC (bb2, 0)->dest)
+	{
+	  basic_block bb3 = EDGE_SUCC (bb1, 0)->dest;
+
+	  if (single_succ_p (bb1)
+	      && single_succ_p (bb2)
+	      && single_pred_p (bb1)
+	      && single_pred_p (bb2)
+	      && EDGE_COUNT (bb->succs) == 2
+	      && EDGE_COUNT (bb3->preds) == 2)
+	    hoist_adjacent_loads (bb, bb1, bb2, bb3);
+	  continue;
+	}
       else
 	continue;      
 
@@ -1707,6 +1778,206 @@  cond_if_else_store_replacement (basic_block then_b
   return ok;
 }
 
+/* Return TRUE if STMT has a VUSE whose corresponding VDEF is in BB.  */
+
+static bool
+local_mem_dependence (gimple stmt, basic_block bb)
+{
+  tree vuse = gimple_vuse (stmt);
+  gimple def;
+
+  if (!vuse)
+    return false;
+
+  def = SSA_NAME_DEF_STMT (vuse);
+  return (def && gimple_bb (def) == bb);
+}
+
+/* Given a "diamond" control-flow pattern where BB0 tests a condition,
+   BB1 and BB2 are "then" and "else" blocks dependent on this test,
+   and BB3 rejoins control flow following BB1 and BB2, look for 
+   opportunities to hoist loads as follows.  If BB3 contains a PHI of
+   two loads, one each occurring in BB1 and BB2, and the loads are
+   provably of adjacent fields in the same structure, then move both
+   loads into BB0.  Of course this can only be done if there are no
+   dependencies preventing such motion.
+
+   One of the hoisted loads will always be speculative, so the
+   transformation is currently conservative:
+
+    - The fields must be strictly adjacent.
+    - The two fields must occupy a single memory block that is
+      guaranteed to not cross a page boundary.
+
+    The last is difficult to prove, as such memory blocks should be
+    aligned on the minimum of the stack alignment boundary and the
+    alignment guaranteed by heap allocation interfaces.  Thus we rely
+    on a parameter for the alignment value.
+
+    Provided a good value is used for the last case, the first
+    restriction could possibly be relaxed.  */
+
+static void
+hoist_adjacent_loads (basic_block bb0, basic_block bb1,
+		      basic_block bb2, basic_block bb3)
+{
+  int param_align = PARAM_VALUE (PARAM_MIN_CMOVE_STRUCT_ALIGN);
+  gimple_stmt_iterator gsi;
+
+  /* Walk the phis in bb3 looking for an opportunity.  We are looking
+     for phis of two SSA names, one each of which is defined in bb1 and
+     bb2.  */
+  for (gsi = gsi_start_phis (bb3); !gsi_end_p (gsi); gsi_next (&gsi))
+    {
+      gimple phi_stmt = gsi_stmt (gsi);
+      gimple def1, def2, defswap;
+      tree arg1, arg2, ref1, ref2, field1, field2, fieldswap;
+      tree tree_offset1, tree_offset2, tree_size2, next;
+      int offset1, offset2, size2, block_offset;
+      unsigned align1;
+      gimple_stmt_iterator gsi2;
+
+      if (gimple_phi_num_args (phi_stmt) != 2)
+	continue;
+
+      arg1 = gimple_phi_arg_def (phi_stmt, 0);
+      arg2 = gimple_phi_arg_def (phi_stmt, 1);
+      
+      if (TREE_CODE (arg1) != SSA_NAME
+	  || TREE_CODE (arg2) != SSA_NAME
+	  || SSA_NAME_IS_DEFAULT_DEF (arg1)
+	  || SSA_NAME_IS_DEFAULT_DEF (arg2))
+	continue;
+
+      def1 = SSA_NAME_DEF_STMT (arg1);
+      def2 = SSA_NAME_DEF_STMT (arg2);
+
+      if ((gimple_bb (def1) != bb1 || gimple_bb (def2) != bb2)
+	  && (gimple_bb (def2) != bb1 || gimple_bb (def1) != bb2))
+	continue;
+
+      /* Unless -fhoist-adjacent-loads was specified, check the mode
+	 of the arguments to be sure a conditional move can be generated
+	 for it.  */
+      if (flag_hoist_adjacent_loads != 1
+	  && !optab_handler (cmov_optab, TYPE_MODE (TREE_TYPE (arg1))))
+	continue;
+
+      /* Both statements must be assignments whose RHS is a COMPONENT_REF.  */
+      if (!gimple_assign_single_p (def1)
+	  || !gimple_assign_single_p (def2))
+	continue;
+
+      ref1 = gimple_assign_rhs1 (def1);
+      ref2 = gimple_assign_rhs1 (def2);
+
+      if (TREE_CODE (ref1) != COMPONENT_REF
+	  || TREE_CODE (ref2) != COMPONENT_REF)
+	continue;
+
+      /* The zeroth operand of the two component references must be
+	 identical.  It is not sufficient to compare get_base_address of
+	 the two references, because this could allow for different
+	 elements of the same array in the two trees.  It is not safe to
+	 assume that the existence of one array element implies the
+	 existence of a different one.  */
+      if (!operand_equal_p (TREE_OPERAND (ref1, 0), TREE_OPERAND (ref2, 0), 0))
+	continue;
+
+      field1 = TREE_OPERAND (ref1, 1);
+      field2 = TREE_OPERAND (ref2, 1);
+
+      /* Check for field adjacency, and ensure field1 comes first.  */
+      for (next = DECL_CHAIN (field1);
+	   next && TREE_CODE (next) != FIELD_DECL;
+	   next = DECL_CHAIN (next))
+	;
+
+      if (!next || !operand_equal_p (next, field2, 0))
+	{
+	  for (next = DECL_CHAIN (field2);
+	       next && TREE_CODE (next) != FIELD_DECL;
+	       next = DECL_CHAIN (next))
+	    ;
+
+	  if (!next || !operand_equal_p (next, field1, 0))
+	    continue;
+
+	  fieldswap = field1;
+	  field1 = field2;
+	  field2 = fieldswap;
+	  defswap = def1;
+	  def1 = def2;
+	  def2 = defswap;
+	}
+
+      /* Check for proper alignment of the first field.  */
+      tree_offset1 = bit_position (field1);
+      tree_offset2 = bit_position (field2);
+      tree_size2 = DECL_SIZE_UNIT (field2);
+
+      if (TREE_CODE (tree_size2) != INTEGER_CST
+	  || !host_integerp (tree_offset1, 0)
+	  || !host_integerp (tree_offset2, 0)
+	  || !host_integerp (tree_size2, 0))
+	continue;
+
+      offset1 = TREE_INT_CST_LOW (tree_offset1);
+      offset2 = TREE_INT_CST_LOW (tree_offset2);
+      size2 = TREE_INT_CST_LOW (tree_size2);
+      align1 = DECL_ALIGN (field1);
+
+      if (offset1 % BITS_PER_UNIT != 0 || offset1 % align1 != 0)
+	continue;
+
+      offset1 /= BITS_PER_UNIT;
+      offset2 /= BITS_PER_UNIT;
+
+      /* The two field references must fit within a block of memory that
+	 is guaranteed to be on the same page.  */
+      block_offset = offset1 % param_align;
+
+      if (block_offset + offset2 - offset1 + size2 > param_align)
+	continue;
+
+      /* The two expressions cannot be dependent upon vdefs defined
+	 in bb1/bb2.  */
+      if (local_mem_dependence (def1, bb1) || local_mem_dependence (def2, bb2))
+	continue;
+
+      /* The conditions are satisfied; hoist the loads from bb1 and bb2 into
+	 bb0.  We hoist the first one first so that a cache miss is handled
+         efficiently regardless of hardware cache-fill policy.  */
+      gsi2 = gsi_for_stmt (def1);
+      gsi_move_to_bb_end (&gsi2, bb0);
+      gsi2 = gsi_for_stmt (def2);
+      gsi_move_to_bb_end (&gsi2, bb0);
+
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	{
+	  fprintf (dump_file,
+		   "\nHoisting adjacent loads from %d and %d into %d: \n",
+		   bb1->index, bb2->index, bb0->index);
+	  print_gimple_stmt (dump_file, def1, 0, TDF_VOPS|TDF_MEMSYMS);
+	  print_gimple_stmt (dump_file, def2, 0, TDF_VOPS|TDF_MEMSYMS);
+	}
+    }
+}
+
+/* Determine whether we should attempt to hoist adjacent loads out of
+   diamond patterns in pass_phiopt.  Always hoist loads if
+   -fhoist-adjacent-loads is specified, or if the target machine has
+   a conditional move instruction and -fno-hoist-adjacent-loads is
+   not specified.  */
+
+static bool
+gate_hoist_loads (void)
+{
+  return ((optimize > 1)
+	  && (flag_hoist_adjacent_loads == 1
+	      || (HAVE_conditional_move && flag_hoist_adjacent_loads != 0)));
+}
+
 /* Always do these optimizations if we have SSA
    trees to work on.  */
 static bool
Index: gcc/common.opt
===================================================================
--- gcc/common.opt	(revision 187728)
+++ gcc/common.opt	(working copy)
@@ -1186,6 +1186,11 @@  fgraphite-identity
 Common Report Var(flag_graphite_identity) Optimization
 Enable Graphite Identity transformation
 
+fhoist-adjacent-loads
+Common Report Var(flag_hoist_adjacent_loads) Init(-1) Optimization
+Enable hoisting adjacent loads to encourage generating conditional move
+instructions
+
 floop-parallelize-all
 Common Report Var(flag_loop_parallelize_all) Optimization
 Mark all loops as parallel
Index: gcc/Makefile.in
===================================================================
--- gcc/Makefile.in	(revision 187728)
+++ gcc/Makefile.in	(working copy)
@@ -2355,7 +2355,8 @@  tree-ssa-phiopt.o : tree-ssa-phiopt.c $(CONFIG_H)
    $(TM_H) $(GGC_H) $(TREE_H) $(TM_P_H) $(BASIC_BLOCK_H) \
    $(TREE_FLOW_H) $(TREE_PASS_H) $(TREE_DUMP_H) langhooks.h $(FLAGS_H) \
    $(DIAGNOSTIC_H) $(TIMEVAR_H) pointer-set.h domwalk.h $(CFGLOOP_H) \
-   $(TREE_DATA_REF_H) tree-pretty-print.h
+   $(TREE_DATA_REF_H) tree-pretty-print.h gimple-pretty-print.h \
+   insn-config.h $(EXPR_H) $(OPTABS_H)
 tree-nrv.o : tree-nrv.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
    $(TM_H) $(TREE_H) $(FUNCTION_H) $(BASIC_BLOCK_H) $(FLAGS_H) \
    $(DIAGNOSTIC_H) $(TREE_FLOW_H) $(TIMEVAR_H) $(TREE_DUMP_H) $(TREE_PASS_H) \
Index: gcc/params.def
===================================================================
--- gcc/params.def	(revision 187728)
+++ gcc/params.def	(working copy)
@@ -979,6 +979,13 @@  DEFPARAM (PARAM_MAX_TRACKED_STRLENS,
 	  "track string lengths",
 	  1000, 0, 0)
 
+/* For adjacent load hoisting transformation in tree-ssa-phiopt.c.  */
+DEFPARAM (PARAM_MIN_CMOVE_STRUCT_ALIGN,
+	  "min-cmove-struct-align",
+	  "Minimum byte alignment to assume for structures in the stack "
+	  "or heap when considering load hoisting for conditional moves",
+	  16, 8, 256)
+
 /*
 Local variables:
 mode:c