Patchwork [RFC] Preserve loop info from tree loop opts to after RTL loop opts (PR44688)

login
register
mail settings
Submitter Richard Guenther
Date Feb. 23, 2012, 3:08 p.m.
Message ID <alpine.LNX.2.00.1202231532520.18230@zhemvz.fhfr.qr>
Download mbox | patch
Permalink /patch/142647/
State New
Headers show

Comments

Richard Guenther - Feb. 23, 2012, 3:08 p.m.
The attached patch blob makes us preserve loop information (the loop
tree) from the start of tree loop optimizations until the end of
RTL loop optimizations.  The motivation for this is to fix excessive
prefetching and loop unrolling we perform on (for example) prologue
loops created by the vectorizer.  The reason why we do so is that
we are not able to analyze/bound their number of iterations.  But
of course the vectorizer perfectly knows a bound to its prologue loops,
so why not record that information ... this is what the inlined patch
does, as well as adjust passes to actually _use_ an upper bound if
available.

The whole patch does not yet pass bootstrap, but the C/C++ testsuites
are fine (and the target libs build).

Thus, inlined below is the "meat" of the patch that makes us perform
less unrolling/prefetching.  For example on 437.leslie3d this reduces
code size from

   text    data     bss     dec     hex filename
 438423       0    4184  442607   6c0ef tml.o

to

   text    data     bss     dec     hex filename
 368903       0    4184  373087   5b15f tml.o

at -Ofast -funroll-loops and from

   text    data     bss     dec     hex filename
 741167       0    4184  745351   b5f87 tml.o

to

   text    data     bss     dec     hex filename
 561479       0    4184  565663   8a19f tml.o

at -Ofast -funroll-loops -march=barcelona.

Attached you find the collection of changes I had to make to preserve 
loops.  The main idea is to make loop_optimizer_finalize a no-op if
PROP_loops is set on the current function.  I added tons of checking
to make sure loop info is correct as well as dominators (loop
verification needs dominators).  I plan to split out the verification
bits (or at least their fixes), then the generic CFG bits that preserve
loops on the RTL side (and the few tree cases I caught).

Any comments on that plan?

Thanks,
Richard.
Index: gcc/loop-init.c
===================================================================
*** gcc/loop-init.c.orig	2012-02-23 13:24:54.000000000 +0100
--- gcc/loop-init.c	2012-02-23 14:06:15.000000000 +0100
*************** along with GCC; see the file COPYING3.
*** 42,56 ****
  void
  loop_optimizer_init (unsigned flags)
  {
!   struct loops *loops;
  
!   gcc_assert (!current_loops);
!   loops = ggc_alloc_cleared_loops ();
  
!   /* Find the loops.  */
  
!   flow_loops_find (loops);
!   current_loops = loops;
  
    if (flags & LOOPS_MAY_HAVE_MULTIPLE_LATCHES)
      {
--- 42,66 ----
  void
  loop_optimizer_init (unsigned flags)
  {
!   /* Ensure that the dominators are computed.  */
!   calculate_dominance_info (CDI_DOMINATORS);
  
!   if (!current_loops)
!     {
!       struct loops *loops = ggc_alloc_cleared_loops ();
! 
!       gcc_assert (!(cfun->curr_properties & PROP_loops));
  
!       /* Find the loops.  */
  
!       flow_loops_find (loops);
!       current_loops = loops;
!     }
!   else
!     {
!       gcc_assert (cfun->curr_properties & PROP_loops);
!       verify_loop_structure ();
!     }
  
    if (flags & LOOPS_MAY_HAVE_MULTIPLE_LATCHES)
      {
*************** loop_optimizer_finalize (void)
*** 105,110 ****
--- 115,136 ----
    struct loop *loop;
    basic_block bb;
  
+   if (loops_state_satisfies_p (LOOPS_HAVE_RECORDED_EXITS))
+     release_recorded_exits ();
+ 
+   /* If we should preserve loop structure, do not free it but clear
+      flags that advanced properties are there as we are not preserving
+      that in full.  */
+   if (cfun->curr_properties & PROP_loops)
+     {
+       loops_state_clear (LOOP_CLOSED_SSA
+ 			 | LOOPS_HAVE_MARKED_IRREDUCIBLE_REGIONS
+ 			 | LOOPS_HAVE_PREHEADERS
+ 			 | LOOPS_HAVE_SIMPLE_LATCHES
+ 			 | LOOPS_HAVE_FALLTHRU_PREHEADERS);
+       return;
+     }
+ 
    gcc_assert (current_loops != NULL);
  
    FOR_EACH_LOOP (li, loop, 0)
*************** loop_optimizer_finalize (void)
*** 113,120 ****
      }
  
    /* Clean up.  */
-   if (loops_state_satisfies_p (LOOPS_HAVE_RECORDED_EXITS))
-     release_recorded_exits ();
    flow_loops_free (current_loops);
    ggc_free (current_loops);
    current_loops = NULL;
--- 139,144 ----
*************** struct rtl_opt_pass pass_rtl_loop_init =
*** 201,206 ****
--- 225,232 ----
  static unsigned int
  rtl_loop_done (void)
  {
+   /* No longer preserve loops, remove them now.  */
+   cfun->curr_properties &= ~PROP_loops;
    loop_optimizer_finalize ();
    free_dominance_info (CDI_DOMINATORS);
  
*************** struct rtl_opt_pass pass_rtl_loop_done =
*** 224,230 ****
    TV_LOOP,                              /* tv_id */
    0,                                    /* properties_required */
    0,                                    /* properties_provided */
!   0,                                    /* properties_destroyed */
    0,                                    /* todo_flags_start */
    TODO_verify_flow
      | TODO_verify_rtl_sharing           /* todo_flags_finish */
--- 250,256 ----
    TV_LOOP,                              /* tv_id */
    0,                                    /* properties_required */
    0,                                    /* properties_provided */
!   PROP_loops,                           /* properties_destroyed */
    0,                                    /* todo_flags_start */
    TODO_verify_flow
      | TODO_verify_rtl_sharing           /* todo_flags_finish */
Index: gcc/passes.c
===================================================================
*** gcc/passes.c.orig	2012-02-23 13:24:54.000000000 +0100
--- gcc/passes.c	2012-02-23 14:06:15.000000000 +0100
*************** execute_function_todo (void *data)
*** 1729,1738 ****
--- 1729,1746 ----
      verify_flow_info ();
    if (flags & TODO_verify_stmts)
      verify_gimple_in_cfg (cfun);
+   if (current_loops
+       /* Between IRA and reload loops are broken but preserved.  */
+       && (cfun->curr_properties & PROP_loops))
+     verify_loop_structure ();
    if (current_loops && loops_state_satisfies_p (LOOP_CLOSED_SSA))
      verify_loop_closed_ssa (false);
    if (flags & TODO_verify_rtl_sharing)
      verify_rtl_sharing ();
+   if (cfun->cfg && dom_info_available_p (CDI_DOMINATORS))
+     verify_dominators (CDI_DOMINATORS);
+   if (cfun->cfg)
+     gcc_assert (!dom_info_available_p (CDI_POST_DOMINATORS));
  #endif
  
    cfun->last_verified = flags & TODO_verify_all;
Index: gcc/ira.c
===================================================================
*** gcc/ira.c.orig	2012-02-23 13:24:54.000000000 +0100
--- gcc/ira.c	2012-02-23 14:06:15.000000000 +0100
*************** ira (FILE *f)
*** 3613,3619 ****
    ira_load_cost = ira_store_cost = ira_shuffle_cost = 0;
    ira_move_loops_num = ira_additional_jumps_num = 0;
  
!   ira_assert (current_loops == NULL);
    if (flag_ira_region == IRA_REGION_ALL || flag_ira_region == IRA_REGION_MIXED)
      {
        flow_loops_find (&ira_loops);
--- 3613,3623 ----
    ira_load_cost = ira_store_cost = ira_shuffle_cost = 0;
    ira_move_loops_num = ira_additional_jumps_num = 0;
  
!   if (current_loops)
!     {
!       cfun->curr_properties &= ~PROP_loops;
!       loop_optimizer_finalize ();
!     }
    if (flag_ira_region == IRA_REGION_ALL || flag_ira_region == IRA_REGION_MIXED)
      {
        flow_loops_find (&ira_loops);
Index: gcc/cgraph.c
===================================================================
*** gcc/cgraph.c.orig	2012-02-23 13:24:54.000000000 +0100
--- gcc/cgraph.c	2012-02-23 14:06:15.000000000 +0100
*************** The callgraph:
*** 99,104 ****
--- 99,105 ----
  #include "ipa-utils.h"
  #include "lto-streamer.h"
  #include "ipa-inline.h"
+ #include "cfgloop.h"
  
  const char * const ld_plugin_symbol_resolution_names[]=
  {
*************** cgraph_release_function_body (struct cgr
*** 1363,1368 ****
--- 1364,1375 ----
      {
        tree old_decl = current_function_decl;
        push_cfun (DECL_STRUCT_FUNCTION (node->decl));
+       if (cfun->cfg
+ 	  && current_loops)
+ 	{
+ 	  cfun->curr_properties &= ~PROP_loops;
+ 	  loop_optimizer_finalize ();
+ 	}
        if (cfun->gimple_df)
  	{
  	  current_function_decl = node->decl;
*************** cgraph_release_function_body (struct cgr
*** 1379,1385 ****
  	}
        if (cfun->value_histograms)
  	free_histograms ();
-       gcc_assert (!current_loops);
        pop_cfun();
        gimple_set_body (node->decl, NULL);
        VEC_free (ipa_opt_pass, heap,
--- 1386,1391 ----
Index: gcc/cfgexpand.c
===================================================================
*** gcc/cfgexpand.c.orig	2012-02-23 13:24:54.000000000 +0100
--- gcc/cfgexpand.c	2012-02-23 14:16:43.000000000 +0100
*************** along with GCC; see the file COPYING3.
*** 47,52 ****
--- 47,53 ----
  #include "ssaexpand.h"
  #include "bitmap.h"
  #include "sbitmap.h"
+ #include "cfgloop.h"
  #include "insn-attr.h" /* For INSN_SCHEDULING.  */
  
  /* This variable holds information helping the rewriting of SSA trees
*************** expand_gimple_cond (basic_block bb, gimp
*** 1938,1943 ****
--- 1939,1946 ----
    false_edge->flags |= EDGE_FALLTHRU;
    new_bb->count = false_edge->count;
    new_bb->frequency = EDGE_FREQUENCY (false_edge);
+   if (current_loops && bb->loop_father)
+     add_bb_to_loop (new_bb, bb->loop_father);
    new_edge = make_edge (new_bb, dest, 0);
    new_edge->probability = REG_BR_PROB_BASE;
    new_edge->count = new_bb->count;
*************** construct_init_block (void)
*** 4103,4108 ****
--- 4106,4113 ----
  				   ENTRY_BLOCK_PTR);
    init_block->frequency = ENTRY_BLOCK_PTR->frequency;
    init_block->count = ENTRY_BLOCK_PTR->count;
+   if (current_loops && ENTRY_BLOCK_PTR->loop_father)
+     add_bb_to_loop (init_block, ENTRY_BLOCK_PTR->loop_father);
    if (e)
      {
        first_block = e->dest;
*************** construct_exit_block (void)
*** 4170,4175 ****
--- 4175,4182 ----
  				   EXIT_BLOCK_PTR->prev_bb);
    exit_block->frequency = EXIT_BLOCK_PTR->frequency;
    exit_block->count = EXIT_BLOCK_PTR->count;
+   if (current_loops && EXIT_BLOCK_PTR->loop_father)
+     add_bb_to_loop (exit_block, EXIT_BLOCK_PTR->loop_father);
  
    ix = 0;
    while (ix < EDGE_COUNT (EXIT_BLOCK_PTR->preds))
*************** gimple_expand_cfg (void)
*** 4362,4367 ****
--- 4369,4376 ----
  
    /* Some backends want to know that we are expanding to RTL.  */
    currently_expanding_to_rtl = 1;
+   /* Dominators are not kept up-to-date as we may create new basic-blocks.  */
+   free_dominance_info (CDI_DOMINATORS);
  
    rtl_profile_for_bb (ENTRY_BLOCK_PTR);
  
*************** gimple_expand_cfg (void)
*** 4535,4540 ****
--- 4544,4551 ----
    timevar_push (TV_POST_EXPAND);
    /* We are no longer in SSA form.  */
    cfun->gimple_df->in_ssa_p = false;
+   if (current_loops)
+     loops_state_clear (LOOP_CLOSED_SSA);
  
    /* Expansion is used by optimization passes too, set maybe_hot_insn_p
       conservatively to true until they are all profile aware.  */
*************** gimple_expand_cfg (void)
*** 4608,4617 ****
    sbitmap_free (blocks);
    purge_all_dead_edges ();
  
-   compact_blocks ();
- 
    expand_stack_alignment ();
  
  #ifdef ENABLE_CHECKING
    verify_flow_info ();
  #endif
--- 4619,4631 ----
    sbitmap_free (blocks);
    purge_all_dead_edges ();
  
    expand_stack_alignment ();
  
+   /* ???  We cannot remove trivially dead insns here as for example
+      the DRAP reg on i?86 is not magically live at this point.
+      gcc.c-torture/execute/ipa-sra-2.c execution, -Os -m32 fails otherwise.  */
+   cleanup_cfg (CLEANUP_NO_INSN_DEL);
+ 
  #ifdef ENABLE_CHECKING
    verify_flow_info ();
  #endif
Index: gcc/cfgloop.c
===================================================================
*** gcc/cfgloop.c.orig	2012-02-23 13:24:54.000000000 +0100
--- gcc/cfgloop.c	2012-02-23 14:06:15.000000000 +0100
*************** verify_loop_structure (void)
*** 1317,1322 ****
--- 1317,1326 ----
    unsigned num = number_of_loops ();
    loop_iterator li;
    struct loop_exit *exit, *mexit;
+   bool dom_available = dom_info_available_p (CDI_DOMINATORS);
+ 
+   /* Ensure that the dominators are computed.  */
+   calculate_dominance_info (CDI_DOMINATORS);
  
    /* Check sizes.  */
    sizes = XCNEWVEC (unsigned, num);
*************** verify_loop_structure (void)
*** 1382,1387 ****
--- 1386,1399 ----
  	      err = 1;
  	    }
  	}
+ #if 0
+       if (loop->latch
+ 	  && loop->latch->loop_father != loop)
+ 	{
+ 	  error ("loop %d%'s latch does not belong directly to it", i);
+ 	  err = 1;
+ 	}
+ #endif
        if (loop->header->loop_father != loop)
  	{
  	  error ("loop %d%'s header does not belong directly to it", i);
*************** verify_loop_structure (void)
*** 1560,1565 ****
--- 1572,1579 ----
    gcc_assert (!err);
  
    free (sizes);
+   if (!dom_available)
+     free_dominance_info (CDI_DOMINATORS);
  }
  
  /* Returns latch edge of LOOP.  */
Index: gcc/tree-pass.h
===================================================================
*** gcc/tree-pass.h.orig	2012-02-23 13:24:54.000000000 +0100
--- gcc/tree-pass.h	2012-02-23 14:06:15.000000000 +0100
*************** struct dump_file_info
*** 239,244 ****
--- 239,245 ----
  #define PROP_gimple_lomp	(1 << 8)	/* lowered OpenMP directives */
  #define PROP_cfglayout	 	(1 << 9)	/* cfglayout mode on RTL */
  #define PROP_gimple_lcx		(1 << 10)       /* lowered complex */
+ #define PROP_loops		(1 << 11)	/* preserve loop structures */
  
  #define PROP_trees \
    (PROP_gimple_any | PROP_gimple_lcf | PROP_gimple_leh | PROP_gimple_lomp)
Index: gcc/tree-ssa-loop.c
===================================================================
*** gcc/tree-ssa-loop.c.orig	2012-02-23 13:24:54.000000000 +0100
--- gcc/tree-ssa-loop.c	2012-02-23 14:06:15.000000000 +0100
*************** struct gimple_opt_pass pass_tree_loop_in
*** 92,98 ****
    0,					/* static_pass_number */
    TV_TREE_LOOP_INIT,			/* tv_id */
    PROP_cfg,				/* properties_required */
!   0,					/* properties_provided */
    0,					/* properties_destroyed */
    0,					/* todo_flags_start */
    0             			/* todo_flags_finish */
--- 92,98 ----
    0,					/* static_pass_number */
    TV_TREE_LOOP_INIT,			/* tv_id */
    PROP_cfg,				/* properties_required */
!   PROP_loops,				/* properties_provided */
    0,					/* properties_destroyed */
    0,					/* todo_flags_start */
    0             			/* todo_flags_finish */
Index: gcc/cfgcleanup.c
===================================================================
*** gcc/cfgcleanup.c.orig	2012-02-23 13:24:54.000000000 +0100
--- gcc/cfgcleanup.c	2012-02-23 14:06:15.000000000 +0100
*************** merge_blocks_move (edge e, basic_block b
*** 779,784 ****
--- 779,789 ----
    if (e->flags & EDGE_FALLTHRU)
      {
        int b_index = b->index, c_index = c->index;
+ 
+       /* Protect the loop latches.  */
+       if (current_loops && c->loop_father->latch == c)
+ 	return NULL;
+ 
        merge_blocks (b, c);
        update_forwarder_flag (b);
  
*************** cleanup_cfg (int mode)
*** 2976,2981 ****
--- 2981,3002 ----
    if (!(mode & CLEANUP_CFGLAYOUT))
      delete_dead_jumptables ();
  
+   if (changed
+       && current_loops)
+     {
+       bitmap changed_bbs;
+       timevar_push (TV_REPAIR_LOOPS);
+       /* The above doesn't preserve dominance info if available.  */
+       gcc_assert (!dom_info_available_p (CDI_DOMINATORS));
+       calculate_dominance_info (CDI_DOMINATORS);
+       changed_bbs = BITMAP_ALLOC (NULL);
+       fix_loop_structure (changed_bbs);
+       BITMAP_FREE (changed_bbs);
+       loops_state_clear (LOOPS_NEED_FIXUP);
+       free_dominance_info (CDI_DOMINATORS);
+       timevar_pop (TV_REPAIR_LOOPS);
+     }
+ 
    timevar_pop (TV_CLEANUP_CFG);
  
    return changed;
Index: gcc/cfghooks.c
===================================================================
*** gcc/cfghooks.c.orig	2012-02-23 13:24:54.000000000 +0100
--- gcc/cfghooks.c	2012-02-23 14:06:15.000000000 +0100
*************** delete_basic_block (basic_block bb)
*** 508,513 ****
--- 508,514 ----
  	{
  	  loop->header = NULL;
  	  loop->latch = NULL;
+ 	  loops_state_set (LOOPS_NEED_FIXUP);
  	}
  
        remove_bb_from_loops (bb);
*************** merge_blocks (basic_block a, basic_block
*** 682,689 ****
  
    cfg_hooks->merge_blocks (a, b);
  
    if (current_loops != NULL)
!     remove_bb_from_loops (b);
  
    /* Normally there should only be one successor of A and that is B, but
       partway though the merge of blocks for conditional_execution we'll
--- 683,700 ----
  
    cfg_hooks->merge_blocks (a, b);
  
+   /* If we merge a loop header into its predecessor, update the loop
+      structure.  */
    if (current_loops != NULL)
!     {
!       if (b->loop_father->header == b)
! 	{
! 	  remove_bb_from_loops (a);
! 	  add_bb_to_loop  (a, b->loop_father);
! 	  a->loop_father->header = a;
! 	}
!       remove_bb_from_loops (b);
!     }
  
    /* Normally there should only be one successor of A and that is B, but
       partway though the merge of blocks for conditional_execution we'll
*************** duplicate_block (basic_block bb, edge e,
*** 999,1004 ****
--- 1010,1027 ----
        struct loop *cloop = bb->loop_father;
        struct loop *copy = get_loop_copy (cloop);
        add_bb_to_loop (new_bb, copy ? copy : cloop);
+       /* If we copied the loop latch block but not the loop, adjust
+ 	 loop state.
+ 	 ???  If we copied the loop header block but not the loop
+ 	 we might either have created a loop copy or a loop with
+ 	 multiple entries.  In both cases we probably have to
+ 	 ditch the loops and arrange for a fixup.  */
+       if (!copy
+ 	  && cloop->latch == bb)
+ 	{
+ 	  cloop->latch = NULL;
+ 	  loops_state_set (LOOPS_MAY_HAVE_MULTIPLE_LATCHES);
+ 	}
      }
  
    return new_bb;
Index: gcc/dominance.c
===================================================================
*** gcc/dominance.c.orig	2012-02-23 13:24:54.000000000 +0100
--- gcc/dominance.c	2012-02-23 14:06:15.000000000 +0100
*************** calculate_dominance_info (enum cdi_direc
*** 639,645 ****
    bool reverse = (dir == CDI_POST_DOMINATORS) ? true : false;
  
    if (dom_computed[dir_index] == DOM_OK)
!     return;
  
    timevar_push (TV_DOMINANCE);
    if (!dom_info_available_p (dir))
--- 639,650 ----
    bool reverse = (dir == CDI_POST_DOMINATORS) ? true : false;
  
    if (dom_computed[dir_index] == DOM_OK)
!     {
! #ifdef ENABLE_CHECKING
!       verify_dominators (dir);
! #endif
!       return;
!     }
  
    timevar_push (TV_DOMINANCE);
    if (!dom_info_available_p (dir))
*************** calculate_dominance_info (enum cdi_direc
*** 667,672 ****
--- 672,681 ----
        free_dom_info (&di);
        dom_computed[dir_index] = DOM_NO_FAST_QUERY;
      }
+ #ifdef ENABLE_CHECKING
+   else
+     verify_dominators (dir);
+ #endif
  
    compute_dom_fast_query (dir);
  
Index: gcc/cfganal.c
===================================================================
*** gcc/cfganal.c.orig	2012-02-23 13:24:54.000000000 +0100
--- gcc/cfganal.c	2012-02-23 14:06:15.000000000 +0100
*************** along with GCC; see the file COPYING3.
*** 37,42 ****
--- 37,43 ----
  #include "bitmap.h"
  #include "sbitmap.h"
  #include "timevar.h"
+ #include "cfgloop.h"
  
  /* Store the data structures necessary for depth-first search.  */
  struct depth_first_search_dsS {
*************** forwarder_block_p (const_basic_block bb)
*** 94,99 ****
--- 95,111 ----
        || !single_succ_p (bb))
      return false;
  
+   /* Protect loop latches, headers and preheaders.  */
+   if (current_loops)
+     {
+       basic_block dest;
+       if (bb->loop_father->header == bb)
+ 	return false;
+       dest = EDGE_SUCC (bb, 0)->dest;
+       if (dest->loop_father->header == dest)
+ 	return false;
+     }
+ 
    for (insn = BB_HEAD (bb); insn != BB_END (bb); insn = NEXT_INSN (insn))
      if (INSN_P (insn) && flow_active_insn_p (insn))
        return false;
Index: gcc/cfgrtl.c
===================================================================
*** gcc/cfgrtl.c.orig	2012-02-23 13:24:54.000000000 +0100
--- gcc/cfgrtl.c	2012-02-23 14:06:15.000000000 +0100
*************** rtl_can_merge_blocks (basic_block a, bas
*** 727,732 ****
--- 727,736 ----
    if (BB_PARTITION (a) != BB_PARTITION (b))
      return false;
  
+   /* Protect the loop latches.  */
+   if (current_loops && b->loop_father->latch == b)
+     return false;
+ 
    /* There must be exactly one edge in between the blocks.  */
    return (single_succ_p (a)
  	  && single_succ (a) == b
*************** cfg_layout_can_merge_blocks_p (basic_blo
*** 2786,2791 ****
--- 2790,2799 ----
    if (BB_PARTITION (a) != BB_PARTITION (b))
      return false;
  
+   /* Protect the loop latches.  */
+   if (current_loops && b->loop_father->latch == b)
+     return false;
+ 
    /* If we would end up moving B's instructions, make sure it doesn't fall
       through into the exit block, since we cannot recover from a fallthrough
       edge into the exit block occurring in the middle of a function.  */
Index: gcc/cprop.c
===================================================================
*** gcc/cprop.c.orig	2012-02-23 13:24:54.000000000 +0100
--- gcc/cprop.c	2012-02-23 14:06:15.000000000 +0100
*************** along with GCC; see the file COPYING3.
*** 48,53 ****
--- 48,54 ----
  #include "df.h"
  #include "dbgcnt.h"
  #include "target.h"
+ #include "cfgloop.h"
  
  
  /* An obstack for our working variables.  */
*************** bypass_block (basic_block bb, rtx setcc,
*** 1608,1615 ****
  	  old_dest = e->dest;
  	  if (dest != NULL
  	      && dest != old_dest
! 	      && dest != EXIT_BLOCK_PTR)
              {
  	      redirect_edge_and_branch_force (e, dest);
  
  	      /* Copy the register setter to the redirected edge.
--- 1609,1634 ----
  	  old_dest = e->dest;
  	  if (dest != NULL
  	      && dest != old_dest
! 	      && dest != EXIT_BLOCK_PTR
! #if 0
! 	      /* Do not redirect loop latch edges.  */
! 	      && (current_loops == NULL
! 		  || e->src->loop_father->latch != e->src)
! #endif
! 	      )
              {
+ 	      if (current_loops != NULL
+ 		  && e->src->loop_father->latch == e->src)
+ 		{
+ 		  /* ???  Now we are creating (or may create) a loop
+ 		     with multiple entries.  Simply mark it for
+ 		     removal.  Alternatively we could not do this
+ 		     threading.  */
+ 		  e->src->loop_father->header = NULL;
+ 		  e->src->loop_father->latch = NULL;
+ 		  loops_state_set (LOOPS_NEED_FIXUP);
+ 		}
+ 
  	      redirect_edge_and_branch_force (e, dest);
  
  	      /* Copy the register setter to the redirected edge.
Index: gcc/except.c
===================================================================
*** gcc/except.c.orig	2012-02-23 13:24:54.000000000 +0100
--- gcc/except.c	2012-02-23 14:06:15.000000000 +0100
*************** along with GCC; see the file COPYING3.
*** 144,149 ****
--- 144,150 ----
  #include "tree-pass.h"
  #include "timevar.h"
  #include "tree-flow.h"
+ #include "cfgloop.h"
  
  /* Provide defaults for stuff that may not be defined when using
     sjlj exceptions.  */
*************** static basic_block
*** 898,904 ****
  emit_to_new_bb_before (rtx seq, rtx insn)
  {
    rtx last;
!   basic_block bb;
    edge e;
    edge_iterator ei;
  
--- 899,905 ----
  emit_to_new_bb_before (rtx seq, rtx insn)
  {
    rtx last;
!   basic_block bb, prev_bb;
    edge e;
    edge_iterator ei;
  
*************** emit_to_new_bb_before (rtx seq, rtx insn
*** 913,921 ****
    last = emit_insn_before (seq, insn);
    if (BARRIER_P (last))
      last = PREV_INSN (last);
!   bb = create_basic_block (seq, last, BLOCK_FOR_INSN (insn)->prev_bb);
    update_bb_for_insn (bb);
    bb->flags |= BB_SUPERBLOCK;
    return bb;
  }
  
--- 914,929 ----
    last = emit_insn_before (seq, insn);
    if (BARRIER_P (last))
      last = PREV_INSN (last);
!   prev_bb = BLOCK_FOR_INSN (insn)->prev_bb;
!   bb = create_basic_block (seq, last, prev_bb);
    update_bb_for_insn (bb);
    bb->flags |= BB_SUPERBLOCK;
+   if (current_loops)
+     {
+       add_bb_to_loop (bb, prev_bb->loop_father);
+       if (prev_bb->loop_father->header == prev_bb)
+ 	prev_bb->loop_father->header = bb;
+     }
    return bb;
  }
  
Index: gcc/omp-low.c
===================================================================
*** gcc/omp-low.c.orig	2012-02-23 13:24:54.000000000 +0100
--- gcc/omp-low.c	2012-02-23 14:06:15.000000000 +0100
*************** finalize_task_copyfn (gimple task_stmt)
*** 1243,1249 ****
  
    /* Inform the callgraph about the new function.  */
    DECL_STRUCT_FUNCTION (child_fn)->curr_properties
!     = cfun->curr_properties;
  
    old_fn = current_function_decl;
    push_cfun (child_cfun);
--- 1243,1249 ----
  
    /* Inform the callgraph about the new function.  */
    DECL_STRUCT_FUNCTION (child_fn)->curr_properties
!     = cfun->curr_properties & ~PROP_loops;
  
    old_fn = current_function_decl;
    push_cfun (child_cfun);
*************** expand_omp_taskreg (struct omp_region *r
*** 3563,3569 ****
  
        /* Inform the callgraph about the new function.  */
        DECL_STRUCT_FUNCTION (child_fn)->curr_properties
! 	= cfun->curr_properties;
        cgraph_add_new_function (child_fn, true);
  
        /* Fix the callgraph edges for child_cfun.  Those for cfun will be
--- 3563,3569 ----
  
        /* Inform the callgraph about the new function.  */
        DECL_STRUCT_FUNCTION (child_fn)->curr_properties
! 	= cfun->curr_properties & ~PROP_loops;
        cgraph_add_new_function (child_fn, true);
  
        /* Fix the callgraph edges for child_cfun.  Those for cfun will be
Index: gcc/tree-if-conv.c
===================================================================
*** gcc/tree-if-conv.c.orig	2012-02-23 13:24:54.000000000 +0100
--- gcc/tree-if-conv.c	2012-02-23 14:06:15.000000000 +0100
*************** combine_blocks (struct loop *loop)
*** 1712,1717 ****
--- 1712,1720 ----
  
    free (ifc_bbs);
    ifc_bbs = NULL;
+ 
+   /* Post-dominators are corrupt now.  */
+   free_dominance_info (CDI_POST_DOMINATORS);
  }
  
  /* If-convert LOOP when it is legal.  For the moment this pass has no
Index: gcc/tree-inline.c
===================================================================
*** gcc/tree-inline.c.orig	2012-02-23 13:24:54.000000000 +0100
--- gcc/tree-inline.c	2012-02-23 14:06:15.000000000 +0100
*************** initialize_cfun (tree new_fndecl, tree c
*** 2093,2099 ****
    cfun->static_chain_decl = src_cfun->static_chain_decl;
    cfun->nonlocal_goto_save_area = src_cfun->nonlocal_goto_save_area;
    cfun->function_end_locus = src_cfun->function_end_locus;
!   cfun->curr_properties = src_cfun->curr_properties;
    cfun->last_verified = src_cfun->last_verified;
    cfun->va_list_gpr_size = src_cfun->va_list_gpr_size;
    cfun->va_list_fpr_size = src_cfun->va_list_fpr_size;
--- 2093,2099 ----
    cfun->static_chain_decl = src_cfun->static_chain_decl;
    cfun->nonlocal_goto_save_area = src_cfun->nonlocal_goto_save_area;
    cfun->function_end_locus = src_cfun->function_end_locus;
!   cfun->curr_properties = src_cfun->curr_properties & ~PROP_loops;
    cfun->last_verified = src_cfun->last_verified;
    cfun->va_list_gpr_size = src_cfun->va_list_gpr_size;
    cfun->va_list_fpr_size = src_cfun->va_list_fpr_size;
Index: gcc/tree-parloops.c
===================================================================
*** gcc/tree-parloops.c.orig	2012-02-23 13:24:54.000000000 +0100
--- gcc/tree-parloops.c	2012-02-23 14:06:15.000000000 +0100
*************** create_parallel_loop (struct loop *loop,
*** 1740,1745 ****
--- 1740,1749 ----
    gimple_set_location (stmt, loc);
    gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
  
+   /* After the above dom info is hosed.  Re-compute it.  */
+   free_dominance_info (CDI_DOMINATORS);
+   calculate_dominance_info (CDI_DOMINATORS);
+ 
    return paral_bb;
  }
  
Index: gcc/tree-mudflap.c
===================================================================
*** gcc/tree-mudflap.c.orig	2012-02-23 13:24:54.000000000 +0100
--- gcc/tree-mudflap.c	2012-02-23 14:06:15.000000000 +0100
*************** along with GCC; see the file COPYING3.
*** 45,50 ****
--- 45,52 ----
  #include "cgraph.h"
  #include "gimple.h"
  
+ extern void add_bb_to_loop (basic_block, struct loop *);
+ 
  /* Internal function decls */
  
  
*************** mf_build_check_statement_for (tree base,
*** 560,565 ****
--- 562,571 ----
        set_immediate_dominator (CDI_DOMINATORS, join_bb, cond_bb);
      }
  
+   /* Update loop info.  */
+   if (current_loops)
+     add_bb_to_loop (then_bb, cond_bb->loop_father);
+ 
    /* Build our local variables.  */
    mf_elem = make_rename_temp (mf_cache_structptr_type, "__mf_elem");
    mf_base = make_rename_temp (mf_uintptr_type, "__mf_base");
Index: gcc/optabs.c
===================================================================
*** gcc/optabs.c.orig	2012-02-23 14:06:11.000000000 +0100
--- gcc/optabs.c	2012-02-23 14:06:21.000000000 +0100
*************** expand_binop (enum machine_mode mode, op
*** 2116,2122 ****
  
        target = gen_reg_rtx (mode);
        emit_libcall_block (insns, target, value,
! 			  gen_rtx_fmt_ee (binoptab->code, mode, op0, op1));
  
        return target;
      }
--- 2116,2127 ----
  
        target = gen_reg_rtx (mode);
        emit_libcall_block (insns, target, value,
! 			  (binoptab == addv_optab
! 			   || binoptab == subv_optab
! 			   || binoptab == smulv_optab)
! 			  && cfun->can_throw_non_call_exceptions
! 			  ? NULL_RTX
! 			  : gen_rtx_fmt_ee (binoptab->code, mode, op0, op1));
  
        return target;
      }
*************** expand_unop (enum machine_mode mode, opt
*** 3197,3203 ****
  	eq_value = simplify_gen_unary (TRUNCATE, outmode, eq_value, mode);
        else if (GET_MODE_SIZE (outmode) > GET_MODE_SIZE (mode))
  	eq_value = simplify_gen_unary (ZERO_EXTEND, outmode, eq_value, mode);
!       emit_libcall_block (insns, target, value, eq_value);
  
        return target;
      }
--- 3202,3212 ----
  	eq_value = simplify_gen_unary (TRUNCATE, outmode, eq_value, mode);
        else if (GET_MODE_SIZE (outmode) > GET_MODE_SIZE (mode))
  	eq_value = simplify_gen_unary (ZERO_EXTEND, outmode, eq_value, mode);
!       emit_libcall_block (insns, target, value,
! 			  (unoptab == negv_optab
! 			   || unoptab == absv_optab)
! 			  && cfun->can_throw_non_call_exceptions
! 			  ? NULL_RTX : eq_value);
  
        return target;
      }
*************** emit_libcall_block (rtx insns, rtx targe
*** 3789,3795 ****
    /* If we're using non-call exceptions, a libcall corresponding to an
       operation that may trap may also trap.  */
    /* ??? See the comment in front of make_reg_eh_region_note.  */
!   if (cfun->can_throw_non_call_exceptions && may_trap_p (equiv))
      {
        for (insn = insns; insn; insn = NEXT_INSN (insn))
  	if (CALL_P (insn))
--- 3798,3805 ----
    /* If we're using non-call exceptions, a libcall corresponding to an
       operation that may trap may also trap.  */
    /* ??? See the comment in front of make_reg_eh_region_note.  */
!   if (cfun->can_throw_non_call_exceptions
!       && (!equiv || may_trap_p (equiv)))
      {
        for (insn = insns; insn; insn = NEXT_INSN (insn))
  	if (CALL_P (insn))
*************** emit_libcall_block (rtx insns, rtx targe
*** 3865,3871 ****
      }
  
    last = emit_move_insn (target, result);
!   set_dst_reg_note (last, REG_EQUAL, copy_rtx (equiv), target);
  
    if (final_dest != target)
      emit_move_insn (final_dest, target);
--- 3875,3882 ----
      }
  
    last = emit_move_insn (target, result);
!   if (equiv)
!     set_dst_reg_note (last, REG_EQUAL, copy_rtx (equiv), target);
  
    if (final_dest != target)
      emit_move_insn (final_dest, target);
Index: gcc/loop-iv.c
===================================================================
*** gcc/loop-iv.c.orig	2011-07-11 17:02:51.000000000 +0200
--- gcc/loop-iv.c	2012-02-23 15:22:14.000000000 +0100
*************** iv_number_of_iterations (struct loop *lo
*** 2764,2769 ****
--- 2764,2773 ----
      {
        if (!desc->niter_max)
  	desc->niter_max = determine_max_iter (loop, desc, old_niter);
+       if (loop->any_upper_bound
+ 	  && double_int_fits_in_uhwi_p (loop->nb_iterations_upper_bound)
+ 	  && loop->nb_iterations_upper_bound.low < desc->niter_max)
+ 	desc->niter_max = loop->nb_iterations_upper_bound.low;
  
        /* simplify_using_initial_values does a copy propagation on the registers
  	 in the expression for the number of iterations.  This prolongs life
Index: gcc/loop-unroll.c
===================================================================
*** gcc/loop-unroll.c.orig	2011-12-02 10:14:44.000000000 +0100
--- gcc/loop-unroll.c	2012-02-23 15:26:46.000000000 +0100
*************** decide_unroll_runtime_iterations (struct
*** 859,865 ****
      }
  
    /* If we have profile feedback, check whether the loop rolls.  */
!   if (loop->header->count && expected_loop_iterations (loop) < 2 * nunroll)
      {
        if (dump_file)
  	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
--- 859,866 ----
      }
  
    /* If we have profile feedback, check whether the loop rolls.  */
!   if ((loop->header->count && expected_loop_iterations (loop) < 2 * nunroll)
!       || desc->niter_max < 2 * nunroll)
      {
        if (dump_file)
  	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
Index: gcc/tree-ssa-loop-niter.c
===================================================================
*** gcc/tree-ssa-loop-niter.c.orig	2011-09-01 12:08:51.000000000 +0200
--- gcc/tree-ssa-loop-niter.c	2012-02-23 14:56:11.000000000 +0100
*************** number_of_iterations_cond (struct loop *
*** 1383,1388 ****
--- 1383,1392 ----
        gcc_unreachable ();
      }
  
+   if (loop->any_upper_bound
+       && double_int_ucmp (loop->nb_iterations_upper_bound, niter->max) < 0)
+     niter->max = loop->nb_iterations_upper_bound;
+ 
    mpz_clear (bnds.up);
    mpz_clear (bnds.below);
  
*************** estimate_numbers_of_iterations_loop (str
*** 3030,3036 ****
    if (loop->estimate_state != EST_NOT_COMPUTED)
      return;
    loop->estimate_state = EST_AVAILABLE;
!   loop->any_upper_bound = false;
    loop->any_estimate = false;
  
    exits = get_loop_exit_edges (loop);
--- 3034,3040 ----
    if (loop->estimate_state != EST_NOT_COMPUTED)
      return;
    loop->estimate_state = EST_AVAILABLE;
!   /* loop->any_upper_bound = false; */
    loop->any_estimate = false;
  
    exits = get_loop_exit_edges (loop);
Index: gcc/tree-ssa-loop-prefetch.c
===================================================================
*** gcc/tree-ssa-loop-prefetch.c.orig	2011-10-12 13:14:10.000000000 +0200
--- gcc/tree-ssa-loop-prefetch.c	2012-02-23 15:05:45.000000000 +0100
*************** loop_prefetch_arrays (struct loop *loop)
*** 1801,1806 ****
--- 1801,1808 ----
  
    ahead = (PREFETCH_LATENCY + time - 1) / time;
    est_niter = max_stmt_executions_int (loop, false);
+   if (est_niter == -1)
+     est_niter = max_stmt_executions_int (loop, true);
  
    /* Prefetching is not likely to be profitable if the trip count to ahead
       ratio is too small.  */
Index: gcc/tree-vect-loop-manip.c
===================================================================
*** gcc/tree-vect-loop-manip.c.orig	2012-02-23 14:45:11.000000000 +0100
--- gcc/tree-vect-loop-manip.c	2012-02-23 14:45:18.000000000 +0100
*************** vect_do_peeling_for_alignment (loop_vec_
*** 2206,2211 ****
--- 2206,2217 ----
  #ifdef ENABLE_CHECKING
    slpeel_verify_cfg_after_peeling (new_loop, loop);
  #endif
+   new_loop->any_upper_bound = true;
+   new_loop->nb_iterations_upper_bound = uhwi_to_double_int (MAX (LOOP_VINFO_VECT_FACTOR (loop_vinfo), min_profitable_iters));
+   if (dump_file && (dump_flags & TDF_DETAILS))
+     fprintf (dump_file, "Setting upper bound of nb iterations for prologue "
+ 	     "loop to %d\n", MAX (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
+ 				  min_profitable_iters));
  
    /* Update number of times loop executes.  */
    n_iters = LOOP_VINFO_NITERS (loop_vinfo);

Patch

Index: gcc/loop-iv.c
===================================================================
--- gcc/loop-iv.c.orig	2011-07-11 17:02:51.000000000 +0200
+++ gcc/loop-iv.c	2012-02-23 15:22:14.000000000 +0100
@@ -2764,6 +2764,10 @@  iv_number_of_iterations (struct loop *lo
     {
       if (!desc->niter_max)
 	desc->niter_max = determine_max_iter (loop, desc, old_niter);
+      if (loop->any_upper_bound
+	  && double_int_fits_in_uhwi_p (loop->nb_iterations_upper_bound)
+	  && loop->nb_iterations_upper_bound.low < desc->niter_max)
+	desc->niter_max = loop->nb_iterations_upper_bound.low;
 
       /* simplify_using_initial_values does a copy propagation on the registers
 	 in the expression for the number of iterations.  This prolongs life
Index: gcc/loop-unroll.c
===================================================================
--- gcc/loop-unroll.c.orig	2011-12-02 10:14:44.000000000 +0100
+++ gcc/loop-unroll.c	2012-02-23 15:26:46.000000000 +0100
@@ -859,7 +859,8 @@  decide_unroll_runtime_iterations (struct
     }
 
   /* If we have profile feedback, check whether the loop rolls.  */
-  if (loop->header->count && expected_loop_iterations (loop) < 2 * nunroll)
+  if ((loop->header->count && expected_loop_iterations (loop) < 2 * nunroll)
+      || desc->niter_max < 2 * nunroll)
     {
       if (dump_file)
 	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
Index: gcc/tree-ssa-loop-niter.c
===================================================================
--- gcc/tree-ssa-loop-niter.c.orig	2011-09-01 12:08:51.000000000 +0200
+++ gcc/tree-ssa-loop-niter.c	2012-02-23 14:56:11.000000000 +0100
@@ -1383,6 +1383,10 @@  number_of_iterations_cond (struct loop *
       gcc_unreachable ();
     }
 
+  if (loop->any_upper_bound
+      && double_int_ucmp (loop->nb_iterations_upper_bound, niter->max) < 0)
+    niter->max = loop->nb_iterations_upper_bound;
+
   mpz_clear (bnds.up);
   mpz_clear (bnds.below);
 
@@ -3030,7 +3034,7 @@  estimate_numbers_of_iterations_loop (str
   if (loop->estimate_state != EST_NOT_COMPUTED)
     return;
   loop->estimate_state = EST_AVAILABLE;
-  loop->any_upper_bound = false;
+  /* loop->any_upper_bound = false; */
   loop->any_estimate = false;
 
   exits = get_loop_exit_edges (loop);
Index: gcc/tree-ssa-loop-prefetch.c
===================================================================
--- gcc/tree-ssa-loop-prefetch.c.orig	2011-10-12 13:14:10.000000000 +0200
+++ gcc/tree-ssa-loop-prefetch.c	2012-02-23 15:05:45.000000000 +0100
@@ -1801,6 +1801,8 @@  loop_prefetch_arrays (struct loop *loop)
 
   ahead = (PREFETCH_LATENCY + time - 1) / time;
   est_niter = max_stmt_executions_int (loop, false);
+  if (est_niter == -1)
+    est_niter = max_stmt_executions_int (loop, true);
 
   /* Prefetching is not likely to be profitable if the trip count to ahead
      ratio is too small.  */
Index: gcc/tree-vect-loop-manip.c
===================================================================
--- gcc/tree-vect-loop-manip.c.orig	2012-02-23 14:45:11.000000000 +0100
+++ gcc/tree-vect-loop-manip.c	2012-02-23 14:45:18.000000000 +0100
@@ -2206,6 +2206,12 @@  vect_do_peeling_for_alignment (loop_vec_
 #ifdef ENABLE_CHECKING
   slpeel_verify_cfg_after_peeling (new_loop, loop);
 #endif
+  new_loop->any_upper_bound = true;
+  new_loop->nb_iterations_upper_bound = uhwi_to_double_int (MAX (LOOP_VINFO_VECT_FACTOR (loop_vinfo), min_profitable_iters));
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    fprintf (dump_file, "Setting upper bound of nb iterations for prologue "
+	     "loop to %d\n", MAX (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
+				  min_profitable_iters));
 
   /* Update number of times loop executes.  */
   n_iters = LOOP_VINFO_NITERS (loop_vinfo);