Patchwork [0/n] tree LIM TLC - series part for backporting, limit LIM

login
register
mail settings
Submitter Richard Guenther
Date March 14, 2013, 1:39 p.m.
Message ID <alpine.LNX.2.00.1303141428080.3543@zhemvz.fhfr.qr>
Download mbox | patch
Permalink /patch/227667/
State New
Headers show

Comments

Richard Guenther - March 14, 2013, 1:39 p.m.
This extracts pieces from the already posted patch series that are
most worthwhile and applicable for backporting to both 4.8 and 4.7.
It also re-implements the limiting of the maximum number of memory
references to consider for LIMs dependence analysis.  This limiting
is now done per loop-nest and disables optimizing outer loops
only.  The limiting requires backporting introduction of the
shared unalalyzable mem-ref - it works by marking that as stored
in loops we do not want to compute dependences for - which makes
dependence computation for mems in those loops linear, as that
mem-ref, which conveniently has ID 0, is tested first.

Bootstrapped and tested on x86_64-unknown-linux-gnu.

The current limit of 1000 datarefs is quite low (well, for LIMs
purposes, that is), and I only bothered to care about -O1 for
backports (no caching of the affine combination).  With the
limit in place and at -O1 LIM now takes

 tree loop invariant motion:   0.55 ( 1%) usr

for the testcase in PR39326.  Four patches in total, we might
consider not backporting the limiting, without it this
insane testcase has, at ~2GB memory usage (peak determined by IRA)

 tree loop invariant motion: 533.30 (77%) usr

but avoids running into the DSE / combine issue (and thus stays
managable overall at -O1).  With limiting it requires -fno-dse
to not blow up (>5GB of memory use).

Richard.

2013-03-12  Richard Biener  <rguenther@suse.de>

	PR tree-optimization/39326
	* tree-ssa-loop-im.c (refs_independent_p): Exploit symmetry.



2013-03-14  Richard Biener  <rguenther@suse.de>

	PR tree-optimization/39326
	* tree-ssa-loop-im.c (struct mem_ref): Replace mem member with
	ao_ref typed member.
	(MEM_ANALYZABLE): Adjust.
	(memref_eq): Likewise.
	(mem_ref_alloc): Likewise.
	(gather_mem_refs_stmt): Likewise.
	(mem_refs_may_alias_p): Use the ao_ref to query the alias oracle.
	(execute_sm_if_changed_flag_set): Adjust.
	(execute_sm): Likewise.
	(ref_always_accessed_p): Likewise.
	(refs_independent_p): Likewise.
	(can_sm_ref_p): Likewise.

Index: trunk/gcc/tree-ssa-loop-im.c
===================================================================
*** trunk.orig/gcc/tree-ssa-loop-im.c	2013-03-14 12:36:49.000000000 +0100
--- trunk/gcc/tree-ssa-loop-im.c	2013-03-14 12:43:16.180191012 +0100
*************** typedef struct mem_ref_locs
*** 117,126 ****
  
  typedef struct mem_ref
  {
-   tree mem;			/* The memory itself.  */
    unsigned id;			/* ID assigned to the memory reference
  				   (its index in memory_accesses.refs_list)  */
    hashval_t hash;		/* Its hash value.  */
    bitmap stored;		/* The set of loops in that this memory location
  				   is stored to.  */
    vec<mem_ref_locs_p> accesses_in_loop;
--- 117,130 ----
  
  typedef struct mem_ref
  {
    unsigned id;			/* ID assigned to the memory reference
  				   (its index in memory_accesses.refs_list)  */
    hashval_t hash;		/* Its hash value.  */
+ 
+   /* The memory access itself and associated caching of alias-oracle
+      query meta-data.  */
+   ao_ref mem;
+ 
    bitmap stored;		/* The set of loops in that this memory location
  				   is stored to.  */
    vec<mem_ref_locs_p> accesses_in_loop;
*************** static bool ref_indep_loop_p (struct loo
*** 186,192 ****
  #define SET_ALWAYS_EXECUTED_IN(BB, VAL) ((BB)->aux = (void *) (VAL))
  
  /* Whether the reference was analyzable.  */
! #define MEM_ANALYZABLE(REF) ((REF)->mem != error_mark_node)
  
  static struct lim_aux_data *
  init_lim_data (gimple stmt)
--- 190,196 ----
  #define SET_ALWAYS_EXECUTED_IN(BB, VAL) ((BB)->aux = (void *) (VAL))
  
  /* Whether the reference was analyzable.  */
! #define MEM_ANALYZABLE(REF) ((REF)->mem.ref != error_mark_node)
  
  static struct lim_aux_data *
  init_lim_data (gimple stmt)
*************** memref_eq (const void *obj1, const void
*** 1449,1455 ****
  {
    const struct mem_ref *const mem1 = (const struct mem_ref *) obj1;
  
!   return operand_equal_p (mem1->mem, (const_tree) obj2, 0);
  }
  
  /* Releases list of memory reference locations ACCS.  */
--- 1453,1459 ----
  {
    const struct mem_ref *const mem1 = (const struct mem_ref *) obj1;
  
!   return operand_equal_p (mem1->mem.ref, (const_tree) obj2, 0);
  }
  
  /* Releases list of memory reference locations ACCS.  */
*************** static mem_ref_p
*** 1491,1497 ****
  mem_ref_alloc (tree mem, unsigned hash, unsigned id)
  {
    mem_ref_p ref = XNEW (struct mem_ref);
!   ref->mem = mem;
    ref->id = id;
    ref->hash = hash;
    ref->stored = BITMAP_ALLOC (&lim_bitmap_obstack);
--- 1495,1501 ----
  mem_ref_alloc (tree mem, unsigned hash, unsigned id)
  {
    mem_ref_p ref = XNEW (struct mem_ref);
!   ao_ref_init (&ref->mem, mem);
    ref->id = id;
    ref->hash = hash;
    ref->stored = BITMAP_ALLOC (&lim_bitmap_obstack);
*************** gather_mem_refs_stmt (struct loop *loop,
*** 1606,1612 ****
        if (dump_file && (dump_flags & TDF_DETAILS))
  	{
  	  fprintf (dump_file, "Memory reference %u: ", id);
! 	  print_generic_expr (dump_file, ref->mem, TDF_SLIM);
  	  fprintf (dump_file, "\n");
  	}
      }
--- 1610,1616 ----
        if (dump_file && (dump_flags & TDF_DETAILS))
  	{
  	  fprintf (dump_file, "Memory reference %u: ", id);
! 	  print_generic_expr (dump_file, ref->mem.ref, TDF_SLIM);
  	  fprintf (dump_file, "\n");
  	}
      }
*************** analyze_memory_references (void)
*** 1730,1736 ****
     tree_to_aff_combination_expand.  */
  
  static bool
! mem_refs_may_alias_p (tree mem1, tree mem2, struct pointer_map_t **ttae_cache)
  {
    /* Perform BASE + OFFSET analysis -- if MEM1 and MEM2 are based on the same
       object and their offset differ in such a way that the locations cannot
--- 1734,1741 ----
     tree_to_aff_combination_expand.  */
  
  static bool
! mem_refs_may_alias_p (mem_ref_p mem1, mem_ref_p mem2,
! 		      struct pointer_map_t **ttae_cache)
  {
    /* Perform BASE + OFFSET analysis -- if MEM1 and MEM2 are based on the same
       object and their offset differ in such a way that the locations cannot
*************** mem_refs_may_alias_p (tree mem1, tree me
*** 1739,1745 ****
    aff_tree off1, off2;
  
    /* Perform basic offset and type-based disambiguation.  */
!   if (!refs_may_alias_p (mem1, mem2))
      return false;
  
    /* The expansion of addresses may be a bit expensive, thus we only do
--- 1744,1750 ----
    aff_tree off1, off2;
  
    /* Perform basic offset and type-based disambiguation.  */
!   if (!refs_may_alias_p_1 (&mem1->mem, &mem2->mem, true))
      return false;
  
    /* The expansion of addresses may be a bit expensive, thus we only do
*************** mem_refs_may_alias_p (tree mem1, tree me
*** 1747,1754 ****
    if (optimize < 2)
      return true;
  
!   get_inner_reference_aff (mem1, &off1, &size1);
!   get_inner_reference_aff (mem2, &off2, &size2);
    aff_combination_expand (&off1, ttae_cache);
    aff_combination_expand (&off2, ttae_cache);
    aff_combination_scale (&off1, double_int_minus_one);
--- 1752,1759 ----
    if (optimize < 2)
      return true;
  
!   get_inner_reference_aff (mem1->mem.ref, &off1, &size1);
!   get_inner_reference_aff (mem2->mem.ref, &off2, &size2);
    aff_combination_expand (&off1, ttae_cache);
    aff_combination_expand (&off2, ttae_cache);
    aff_combination_scale (&off1, double_int_minus_one);
*************** execute_sm_if_changed_flag_set (struct l
*** 2079,2085 ****
    mem_ref_loc_p loc;
    tree flag;
    vec<mem_ref_loc_p> locs = vNULL;
!   char *str = get_lsm_tmp_name (ref->mem, ~0);
  
    lsm_tmp_name_add ("_flag");
    flag = create_tmp_reg (boolean_type_node, str);
--- 2084,2090 ----
    mem_ref_loc_p loc;
    tree flag;
    vec<mem_ref_loc_p> locs = vNULL;
!   char *str = get_lsm_tmp_name (ref->mem.ref, ~0);
  
    lsm_tmp_name_add ("_flag");
    flag = create_tmp_reg (boolean_type_node, str);
*************** execute_sm (struct loop *loop, vec<edge>
*** 2121,2136 ****
    if (dump_file && (dump_flags & TDF_DETAILS))
      {
        fprintf (dump_file, "Executing store motion of ");
!       print_generic_expr (dump_file, ref->mem, 0);
        fprintf (dump_file, " from loop %d\n", loop->num);
      }
  
!   tmp_var = create_tmp_reg (TREE_TYPE (ref->mem),
! 			      get_lsm_tmp_name (ref->mem, ~0));
  
    fmt_data.loop = loop;
    fmt_data.orig_loop = loop;
!   for_each_index (&ref->mem, force_move_till, &fmt_data);
  
    if (block_in_transaction (loop_preheader_edge (loop)->src)
        || !PARAM_VALUE (PARAM_ALLOW_STORE_DATA_RACES))
--- 2126,2141 ----
    if (dump_file && (dump_flags & TDF_DETAILS))
      {
        fprintf (dump_file, "Executing store motion of ");
!       print_generic_expr (dump_file, ref->mem.ref, 0);
        fprintf (dump_file, " from loop %d\n", loop->num);
      }
  
!   tmp_var = create_tmp_reg (TREE_TYPE (ref->mem.ref),
! 			    get_lsm_tmp_name (ref->mem.ref, ~0));
  
    fmt_data.loop = loop;
    fmt_data.orig_loop = loop;
!   for_each_index (&ref->mem.ref, force_move_till, &fmt_data);
  
    if (block_in_transaction (loop_preheader_edge (loop)->src)
        || !PARAM_VALUE (PARAM_ALLOW_STORE_DATA_RACES))
*************** execute_sm (struct loop *loop, vec<edge>
*** 2148,2154 ****
    /* FIXME/TODO: For the multi-threaded variant, we could avoid this
       load altogether, since the store is predicated by a flag.  We
       could, do the load only if it was originally in the loop.  */
!   load = gimple_build_assign (tmp_var, unshare_expr (ref->mem));
    lim_data = init_lim_data (load);
    lim_data->max_loop = loop;
    lim_data->tgt_loop = loop;
--- 2153,2159 ----
    /* FIXME/TODO: For the multi-threaded variant, we could avoid this
       load altogether, since the store is predicated by a flag.  We
       could, do the load only if it was originally in the loop.  */
!   load = gimple_build_assign (tmp_var, unshare_expr (ref->mem.ref));
    lim_data = init_lim_data (load);
    lim_data->max_loop = loop;
    lim_data->tgt_loop = loop;
*************** execute_sm (struct loop *loop, vec<edge>
*** 2168,2178 ****
      if (!multi_threaded_model_p)
        {
  	gimple store;
! 	store = gimple_build_assign (unshare_expr (ref->mem), tmp_var);
  	gsi_insert_on_edge (ex, store);
        }
      else
!       execute_sm_if_changed (ex, ref->mem, tmp_var, store_flag);
  }
  
  /* Hoists memory references MEM_REFS out of LOOP.  EXITS is the list of exit
--- 2173,2183 ----
      if (!multi_threaded_model_p)
        {
  	gimple store;
! 	store = gimple_build_assign (unshare_expr (ref->mem.ref), tmp_var);
  	gsi_insert_on_edge (ex, store);
        }
      else
!       execute_sm_if_changed (ex, ref->mem.ref, tmp_var, store_flag);
  }
  
  /* Hoists memory references MEM_REFS out of LOOP.  EXITS is the list of exit
*************** ref_always_accessed_p (struct loop *loop
*** 2206,2214 ****
    struct loop *must_exec;
    tree base;
  
!   base = get_base_address (ref->mem);
!   if (INDIRECT_REF_P (base)
!       || TREE_CODE (base) == MEM_REF)
      base = TREE_OPERAND (base, 0);
  
    get_all_locs_in_loop (loop, ref, &locs);
--- 2211,2218 ----
    struct loop *must_exec;
    tree base;
  
!   base = ao_ref_base (&ref->mem);
!   if (TREE_CODE (base) == MEM_REF)
      base = TREE_OPERAND (base, 0);
  
    get_all_locs_in_loop (loop, ref, &locs);
*************** refs_independent_p (mem_ref_p ref1, mem_
*** 2279,2286 ****
      fprintf (dump_file, "Querying dependency of refs %u and %u: ",
  	     ref1->id, ref2->id);
  
!   if (mem_refs_may_alias_p (ref1->mem, ref2->mem,
! 			    &memory_accesses.ttae_cache))
      {
        bitmap_set_bit (ref1->dep_ref, ref2->id);
        if (dump_file && (dump_flags & TDF_DETAILS))
--- 2283,2289 ----
      fprintf (dump_file, "Querying dependency of refs %u and %u: ",
  	     ref1->id, ref2->id);
  
!   if (mem_refs_may_alias_p (ref1, ref2, &memory_accesses.ttae_cache))
      {
        bitmap_set_bit (ref1->dep_ref, ref2->id);
        if (dump_file && (dump_flags & TDF_DETAILS))
*************** can_sm_ref_p (struct loop *loop, mem_ref
*** 2380,2400 ****
      return false;
  
    /* It should be movable.  */
!   if (!is_gimple_reg_type (TREE_TYPE (ref->mem))
!       || TREE_THIS_VOLATILE (ref->mem)
!       || !for_each_index (&ref->mem, may_move_till, loop))
      return false;
  
    /* If it can throw fail, we do not properly update EH info.  */
!   if (tree_could_throw_p (ref->mem))
      return false;
  
    /* If it can trap, it must be always executed in LOOP.
       Readonly memory locations may trap when storing to them, but
       tree_could_trap_p is a predicate for rvalues, so check that
       explicitly.  */
!   base = get_base_address (ref->mem);
!   if ((tree_could_trap_p (ref->mem)
         || (DECL_P (base) && TREE_READONLY (base)))
        && !ref_always_accessed_p (loop, ref, true))
      return false;
--- 2383,2403 ----
      return false;
  
    /* It should be movable.  */
!   if (!is_gimple_reg_type (TREE_TYPE (ref->mem.ref))
!       || TREE_THIS_VOLATILE (ref->mem.ref)
!       || !for_each_index (&ref->mem.ref, may_move_till, loop))
      return false;
  
    /* If it can throw fail, we do not properly update EH info.  */
!   if (tree_could_throw_p (ref->mem.ref))
      return false;
  
    /* If it can trap, it must be always executed in LOOP.
       Readonly memory locations may trap when storing to them, but
       tree_could_trap_p is a predicate for rvalues, so check that
       explicitly.  */
!   base = get_base_address (ref->mem.ref);
!   if ((tree_could_trap_p (ref->mem.ref)
         || (DECL_P (base) && TREE_READONLY (base)))
        && !ref_always_accessed_p (loop, ref, true))
      return false;


2013-03-14  Richard Biener  <rguenther@suse.de>

	PR tree-optimization/39326
	* tree-ssa-loop-im.c (UNANALYZABLE_MEM_ID): New define.
	(MEM_ANALYZABLE): Adjust.
	(record_mem_ref_loc): Move bitmap ops ...
	(gather_mem_refs_stmt): ... here.  Use the shared mem-ref for
	unanalyzable refs, do not record locations for it.
	(analyze_memory_references): Allocate ref zero as shared
	unanalyzable ref.

Index: trunk/gcc/tree-ssa-loop-im.c
===================================================================
*** trunk.orig/gcc/tree-ssa-loop-im.c	2013-03-14 12:43:16.000000000 +0100
--- trunk/gcc/tree-ssa-loop-im.c	2013-03-14 12:52:37.218747012 +0100
*************** static bool ref_indep_loop_p (struct loo
*** 189,196 ****
  #define ALWAYS_EXECUTED_IN(BB) ((struct loop *) (BB)->aux)
  #define SET_ALWAYS_EXECUTED_IN(BB, VAL) ((BB)->aux = (void *) (VAL))
  
  /* Whether the reference was analyzable.  */
! #define MEM_ANALYZABLE(REF) ((REF)->mem.ref != error_mark_node)
  
  static struct lim_aux_data *
  init_lim_data (gimple stmt)
--- 189,199 ----
  #define ALWAYS_EXECUTED_IN(BB) ((struct loop *) (BB)->aux)
  #define SET_ALWAYS_EXECUTED_IN(BB, VAL) ((BB)->aux = (void *) (VAL))
  
+ /* ID of the shared unanalyzable mem.  */
+ #define UNANALYZABLE_MEM_ID 0
+ 
  /* Whether the reference was analyzable.  */
! #define MEM_ANALYZABLE(REF) ((REF)->id != UNANALYZABLE_MEM_ID)
  
  static struct lim_aux_data *
  init_lim_data (gimple stmt)
*************** record_mem_ref_loc (mem_ref_p ref, struc
*** 1526,1532 ****
  {
    mem_ref_loc_p aref = XNEW (struct mem_ref_loc);
    mem_ref_locs_p accs;
-   bitmap ril = memory_accesses.refs_in_loop[loop->num];
  
    if (ref->accesses_in_loop.length ()
        <= (unsigned) loop->num)
--- 1529,1534 ----
*************** record_mem_ref_loc (mem_ref_p ref, struc
*** 1542,1548 ****
    aref->ref = loc;
  
    accs->locs.safe_push (aref);
-   bitmap_set_bit (ril, ref->id);
  }
  
  /* Marks reference REF as stored in LOOP.  */
--- 1544,1549 ----
*************** gather_mem_refs_stmt (struct loop *loop,
*** 1578,1624 ****
    mem = simple_mem_ref_in_stmt (stmt, &is_stored);
    if (!mem)
      {
!       id = memory_accesses.refs_list.length ();
!       ref = mem_ref_alloc (error_mark_node, 0, id);
!       memory_accesses.refs_list.safe_push (ref);
        if (dump_file && (dump_flags & TDF_DETAILS))
  	{
  	  fprintf (dump_file, "Unanalyzed memory reference %u: ", id);
  	  print_gimple_stmt (dump_file, stmt, 0, TDF_SLIM);
  	}
!       if (gimple_vdef (stmt))
! 	mark_ref_stored (ref, loop);
!       record_mem_ref_loc (ref, loop, stmt, mem);
!       return;
!     }
! 
!   hash = iterative_hash_expr (*mem, 0);
!   slot = htab_find_slot_with_hash (memory_accesses.refs, *mem, hash, INSERT);
! 
!   if (*slot)
!     {
!       ref = (mem_ref_p) *slot;
!       id = ref->id;
      }
    else
      {
!       id = memory_accesses.refs_list.length ();
!       ref = mem_ref_alloc (*mem, hash, id);
!       memory_accesses.refs_list.safe_push (ref);
!       *slot = ref;
! 
!       if (dump_file && (dump_flags & TDF_DETAILS))
  	{
! 	  fprintf (dump_file, "Memory reference %u: ", id);
! 	  print_generic_expr (dump_file, ref->mem.ref, TDF_SLIM);
! 	  fprintf (dump_file, "\n");
  	}
-     }
  
    if (is_stored)
      mark_ref_stored (ref, loop);
- 
-   record_mem_ref_loc (ref, loop, stmt, mem);
    return;
  }
  
--- 1579,1624 ----
    mem = simple_mem_ref_in_stmt (stmt, &is_stored);
    if (!mem)
      {
!       /* We use the shared mem_ref for all unanalyzable refs.  */
!       id = UNANALYZABLE_MEM_ID;
!       ref = memory_accesses.refs_list[id];
        if (dump_file && (dump_flags & TDF_DETAILS))
  	{
  	  fprintf (dump_file, "Unanalyzed memory reference %u: ", id);
  	  print_gimple_stmt (dump_file, stmt, 0, TDF_SLIM);
  	}
!       is_stored = gimple_vdef (stmt);
      }
    else
      {
!       hash = iterative_hash_expr (*mem, 0);
!       slot = htab_find_slot_with_hash (memory_accesses.refs,
! 				       *mem, hash, INSERT);
!       if (*slot)
  	{
! 	  ref = (mem_ref_p) *slot;
! 	  id = ref->id;
! 	}
!       else
! 	{
! 	  id = memory_accesses.refs_list.length ();
! 	  ref = mem_ref_alloc (*mem, hash, id);
! 	  memory_accesses.refs_list.safe_push (ref);
! 	  *slot = ref;
! 
! 	  if (dump_file && (dump_flags & TDF_DETAILS))
! 	    {
! 	      fprintf (dump_file, "Memory reference %u: ", id);
! 	      print_generic_expr (dump_file, ref->mem.ref, TDF_SLIM);
! 	      fprintf (dump_file, "\n");
! 	    }
  	}
  
+       record_mem_ref_loc (ref, loop, stmt, mem);
+     }
+   bitmap_set_bit (memory_accesses.refs_in_loop[loop->num], ref->id);
    if (is_stored)
      mark_ref_stored (ref, loop);
    return;
  }
  
*************** analyze_memory_references (void)
*** 1709,1715 ****
    bitmap empty;
  
    memory_accesses.refs = htab_create (100, memref_hash, memref_eq, NULL);
!   memory_accesses.refs_list.create (0);
    memory_accesses.refs_in_loop.create (number_of_loops ());
    memory_accesses.all_refs_in_loop.create (number_of_loops ());
    memory_accesses.all_refs_stored_in_loop.create (number_of_loops ());
--- 1709,1719 ----
    bitmap empty;
  
    memory_accesses.refs = htab_create (100, memref_hash, memref_eq, NULL);
!   memory_accesses.refs_list.create (100);
!   /* Allocate a special, unanalyzable mem-ref with ID zero.  */
!   memory_accesses.refs_list.quick_push
!     (mem_ref_alloc (error_mark_node, 0, UNANALYZABLE_MEM_ID));
! 
    memory_accesses.refs_in_loop.create (number_of_loops ());
    memory_accesses.all_refs_in_loop.create (number_of_loops ());
    memory_accesses.all_refs_stored_in_loop.create (number_of_loops ());


2013-03-14  Richard Biener  <rguenther@suse.de>

	PR tree-optimization/39326
	* tree-ssa-loop-im.c: Include diagnostic-core.h.
	(mark_ref_stored): Optimize.
	(gather_mem_refs_stmt): Also set all_refs_stored_bit if stored.
	(create_vop_ref_mapping_loop, create_vop_ref_mapping): Remove
	and fold functionality into ...
	(gather_mem_refs_in_loops): ... this.  Iterate over loops,
	counting memory references and punting when more than
	--param loop-max-datarefs-for-datadeps.
	(analyze_memory_references): Adjust.

Index: trunk/gcc/tree-ssa-loop-im.c
===================================================================
*** trunk.orig/gcc/tree-ssa-loop-im.c	2013-03-14 12:52:37.000000000 +0100
--- trunk/gcc/tree-ssa-loop-im.c	2013-03-14 14:23:47.533164359 +0100
*************** along with GCC; see the file COPYING3.
*** 20,25 ****
--- 20,26 ----
  #include "config.h"
  #include "system.h"
  #include "coretypes.h"
+ #include "diagnostic-core.h"
  #include "tm.h"
  #include "tree.h"
  #include "tm_p.h"
*************** record_mem_ref_loc (mem_ref_p ref, struc
*** 1551,1561 ****
  static void
  mark_ref_stored (mem_ref_p ref, struct loop *loop)
  {
!   for (;
!        loop != current_loops->tree_root
!        && !bitmap_bit_p (ref->stored, loop->num);
!        loop = loop_outer (loop))
!     bitmap_set_bit (ref->stored, loop->num);
  }
  
  /* Gathers memory references in statement STMT in LOOP, storing the
--- 1552,1560 ----
  static void
  mark_ref_stored (mem_ref_p ref, struct loop *loop)
  {
!   while (loop != current_loops->tree_root
! 	 && bitmap_set_bit (ref->stored, loop->num))
!     loop = loop_outer (loop);
  }
  
  /* Gathers memory references in statement STMT in LOOP, storing the
*************** gather_mem_refs_stmt (struct loop *loop,
*** 1618,1624 ****
      }
    bitmap_set_bit (memory_accesses.refs_in_loop[loop->num], ref->id);
    if (is_stored)
!     mark_ref_stored (ref, loop);
    return;
  }
  
--- 1617,1627 ----
      }
    bitmap_set_bit (memory_accesses.refs_in_loop[loop->num], ref->id);
    if (is_stored)
!     {
!       bitmap_set_bit (memory_accesses.all_refs_stored_in_loop[loop->num],
! 		      ref->id);
!       mark_ref_stored (ref, loop);
!     }
    return;
  }
  
*************** gather_mem_refs_stmt (struct loop *loop,
*** 1627,1704 ****
  static void
  gather_mem_refs_in_loops (void)
  {
-   gimple_stmt_iterator bsi;
-   basic_block bb;
    struct loop *loop;
    loop_iterator li;
-   bitmap lrefs, alrefs, alrefso;
- 
-   FOR_EACH_BB (bb)
-     {
-       loop = bb->loop_father;
-       if (loop == current_loops->tree_root)
- 	continue;
  
!       for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
! 	gather_mem_refs_stmt (loop, gsi_stmt (bsi));
!     }
  
    /* Propagate the information about accessed memory references up
       the loop hierarchy.  */
    FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
      {
!       lrefs = memory_accesses.refs_in_loop[loop->num];
!       alrefs = memory_accesses.all_refs_in_loop[loop->num];
!       bitmap_ior_into (alrefs, lrefs);
  
!       if (loop_outer (loop) == current_loops->tree_root)
  	continue;
  
!       alrefso = memory_accesses.all_refs_in_loop[loop_outer (loop)->num];
!       bitmap_ior_into (alrefso, alrefs);
      }
- }
- 
- /* Create a mapping from virtual operands to references that touch them
-    in LOOP.  */
- 
- static void
- create_vop_ref_mapping_loop (struct loop *loop)
- {
-   bitmap refs = memory_accesses.refs_in_loop[loop->num];
-   struct loop *sloop;
-   bitmap_iterator bi;
-   unsigned i;
-   mem_ref_p ref;
  
!   EXECUTE_IF_SET_IN_BITMAP (refs, 0, i, bi)
!     {
!       ref = memory_accesses.refs_list[i];
!       for (sloop = loop; sloop != current_loops->tree_root;
! 	   sloop = loop_outer (sloop))
! 	if (bitmap_bit_p (ref->stored, loop->num))
! 	  {
! 	    bitmap refs_stored
! 	      = memory_accesses.all_refs_stored_in_loop[sloop->num];
! 	    bitmap_set_bit (refs_stored, ref->id);
! 	  }
!     }
  }
  
- /* For each non-clobbered virtual operand and each loop, record the memory
-    references in this loop that touch the operand.  */
- 
- static void
- create_vop_ref_mapping (void)
- {
-   loop_iterator li;
-   struct loop *loop;
- 
-   FOR_EACH_LOOP (li, loop, 0)
-     {
-       create_vop_ref_mapping_loop (loop);
-     }
- }
  
  /* Gathers information about memory accesses in the loops.  */
  
--- 1630,1707 ----
  static void
  gather_mem_refs_in_loops (void)
  {
    struct loop *loop;
    loop_iterator li;
  
!   unsigned *count = XCNEWVEC (unsigned, number_of_loops ());
  
    /* Propagate the information about accessed memory references up
       the loop hierarchy.  */
    FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
      {
!       basic_block *bbs = get_loop_body (loop);
!       struct loop *outer;
!       unsigned i;
! 
!       for (i = 0; i < loop->num_nodes; ++i)
! 	{
! 	  basic_block bb = bbs[i];
! 	  gimple_stmt_iterator gsi;
! 	  if (bb->loop_father != loop)
! 	    continue;
! 	  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
! 	    gather_mem_refs_stmt (loop, gsi_stmt (gsi));
! 	}
!       free (bbs);
  
!       /* Finalize the overall numbers (including subloops) for this loop.  */
!       count[loop->num]
! 	+= bitmap_count_bits (memory_accesses.refs_in_loop[loop->num]);
! 
!       /* If there are too many memory references in this loop and its
! 	 sub-loops such that dependence testing would blow up compile-time
! 	 drop a unanalyzed store into the list of memory references to
! 	 get to the early-out.  */
!       if ((count[loop->num]
! 	   > (unsigned) PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
! 	  && bitmap_set_bit
! 	       (memory_accesses.all_refs_stored_in_loop[loop->num],
! 		UNANALYZABLE_MEM_ID))
! 	{
! 	  bitmap_set_bit (memory_accesses.refs_in_loop[loop->num],
! 			  UNANALYZABLE_MEM_ID);
! 	  mark_ref_stored (memory_accesses.refs_list[UNANALYZABLE_MEM_ID],
! 			   loop);
! 	  if (dump_file && (dump_flags & TDF_DETAILS))
! 	    fprintf (dump_file, "Too many memory references in loop %d and its "
! 		     "super-loops, giving up\n", loop->num);
! 	  warning_at (gimple_location (gsi_stmt (gsi_start_bb (loop->header))),
! 		      OPT_Wdisabled_optimization,
! 		      "-ftree-loop-im: number of memory references in loop "
! 		      "exceeds the --param loops-max-datarefs-for-datadeps "
! 		      "threshold");
! 	}
! 
!       /* Finalize the overall touched references (including subloops).  */
!       bitmap_ior_into (memory_accesses.all_refs_in_loop[loop->num],
! 		       memory_accesses.refs_in_loop[loop->num]);
! 
!       /* Propagate the information about accessed memory references up
! 	 the loop hierarchy.  */
!       outer = loop_outer (loop);
!       if (outer == current_loops->tree_root)
  	continue;
  
!       bitmap_ior_into (memory_accesses.all_refs_in_loop[outer->num],
! 		       memory_accesses.all_refs_in_loop[loop->num]);
!       bitmap_ior_into (memory_accesses.all_refs_stored_in_loop[outer->num],
! 		       memory_accesses.all_refs_stored_in_loop[loop->num]);
!       count[outer->num] += count[loop->num];
      }
  
!   free (count);
  }
  
  
  /* Gathers information about memory accesses in the loops.  */
  
*************** analyze_memory_references (void)
*** 1731,1737 ****
    memory_accesses.ttae_cache = NULL;
  
    gather_mem_refs_in_loops ();
-   create_vop_ref_mapping ();
  }
  
  /* Returns true if MEM1 and MEM2 may alias.  TTAE_CACHE is used as a cache in
--- 1734,1739 ----
Richard Guenther - March 15, 2013, 10:36 a.m.
On Thu, 14 Mar 2013, Richard Biener wrote:

> 
> This extracts pieces from the already posted patch series that are
> most worthwhile and applicable for backporting to both 4.8 and 4.7.
> It also re-implements the limiting of the maximum number of memory
> references to consider for LIMs dependence analysis.  This limiting
> is now done per loop-nest and disables optimizing outer loops
> only.  The limiting requires backporting introduction of the
> shared unalalyzable mem-ref - it works by marking that as stored
> in loops we do not want to compute dependences for - which makes
> dependence computation for mems in those loops linear, as that
> mem-ref, which conveniently has ID 0, is tested first.
> 
> Bootstrapped and tested on x86_64-unknown-linux-gnu.
> 
> The current limit of 1000 datarefs is quite low (well, for LIMs
> purposes, that is), and I only bothered to care about -O1 for
> backports (no caching of the affine combination).  With the
> limit in place and at -O1 LIM now takes
> 
>  tree loop invariant motion:   0.55 ( 1%) usr
> 
> for the testcase in PR39326.  Four patches in total, we might
> consider not backporting the limiting, without it this
> insane testcase has, at ~2GB memory usage (peak determined by IRA)
> 
>  tree loop invariant motion: 533.30 (77%) usr
> 
> but avoids running into the DSE / combine issue (and thus stays
> managable overall at -O1).  With limiting it requires -fno-dse
> to not blow up (>5GB of memory use).

Note that the limiting patch (below) causes code-generation differences
because it collects memory-references in a different order and
store-motion applies its transform in order of mem-ref IDs
(different order of loads / stores and different decl UIDs).  The
different ordering results in quite a big speedup because bitmaps
have a more regular form (maybe only for this testcase though).

Richard.

> 2013-03-14  Richard Biener  <rguenther@suse.de>
> 
> 	PR tree-optimization/39326
> 	* tree-ssa-loop-im.c: Include diagnostic-core.h.
> 	(mark_ref_stored): Optimize.
> 	(gather_mem_refs_stmt): Also set all_refs_stored_bit if stored.
> 	(create_vop_ref_mapping_loop, create_vop_ref_mapping): Remove
> 	and fold functionality into ...
> 	(gather_mem_refs_in_loops): ... this.  Iterate over loops,
> 	counting memory references and punting when more than
> 	--param loop-max-datarefs-for-datadeps.
> 	(analyze_memory_references): Adjust.
> 
> Index: trunk/gcc/tree-ssa-loop-im.c
> ===================================================================
> *** trunk.orig/gcc/tree-ssa-loop-im.c	2013-03-14 12:52:37.000000000 +0100
> --- trunk/gcc/tree-ssa-loop-im.c	2013-03-14 14:23:47.533164359 +0100
> *************** along with GCC; see the file COPYING3.
> *** 20,25 ****
> --- 20,26 ----
>   #include "config.h"
>   #include "system.h"
>   #include "coretypes.h"
> + #include "diagnostic-core.h"
>   #include "tm.h"
>   #include "tree.h"
>   #include "tm_p.h"
> *************** record_mem_ref_loc (mem_ref_p ref, struc
> *** 1551,1561 ****
>   static void
>   mark_ref_stored (mem_ref_p ref, struct loop *loop)
>   {
> !   for (;
> !        loop != current_loops->tree_root
> !        && !bitmap_bit_p (ref->stored, loop->num);
> !        loop = loop_outer (loop))
> !     bitmap_set_bit (ref->stored, loop->num);
>   }
>   
>   /* Gathers memory references in statement STMT in LOOP, storing the
> --- 1552,1560 ----
>   static void
>   mark_ref_stored (mem_ref_p ref, struct loop *loop)
>   {
> !   while (loop != current_loops->tree_root
> ! 	 && bitmap_set_bit (ref->stored, loop->num))
> !     loop = loop_outer (loop);
>   }
>   
>   /* Gathers memory references in statement STMT in LOOP, storing the
> *************** gather_mem_refs_stmt (struct loop *loop,
> *** 1618,1624 ****
>       }
>     bitmap_set_bit (memory_accesses.refs_in_loop[loop->num], ref->id);
>     if (is_stored)
> !     mark_ref_stored (ref, loop);
>     return;
>   }
>   
> --- 1617,1627 ----
>       }
>     bitmap_set_bit (memory_accesses.refs_in_loop[loop->num], ref->id);
>     if (is_stored)
> !     {
> !       bitmap_set_bit (memory_accesses.all_refs_stored_in_loop[loop->num],
> ! 		      ref->id);
> !       mark_ref_stored (ref, loop);
> !     }
>     return;
>   }
>   
> *************** gather_mem_refs_stmt (struct loop *loop,
> *** 1627,1704 ****
>   static void
>   gather_mem_refs_in_loops (void)
>   {
> -   gimple_stmt_iterator bsi;
> -   basic_block bb;
>     struct loop *loop;
>     loop_iterator li;
> -   bitmap lrefs, alrefs, alrefso;
> - 
> -   FOR_EACH_BB (bb)
> -     {
> -       loop = bb->loop_father;
> -       if (loop == current_loops->tree_root)
> - 	continue;
>   
> !       for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
> ! 	gather_mem_refs_stmt (loop, gsi_stmt (bsi));
> !     }
>   
>     /* Propagate the information about accessed memory references up
>        the loop hierarchy.  */
>     FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
>       {
> !       lrefs = memory_accesses.refs_in_loop[loop->num];
> !       alrefs = memory_accesses.all_refs_in_loop[loop->num];
> !       bitmap_ior_into (alrefs, lrefs);
>   
> !       if (loop_outer (loop) == current_loops->tree_root)
>   	continue;
>   
> !       alrefso = memory_accesses.all_refs_in_loop[loop_outer (loop)->num];
> !       bitmap_ior_into (alrefso, alrefs);
>       }
> - }
> - 
> - /* Create a mapping from virtual operands to references that touch them
> -    in LOOP.  */
> - 
> - static void
> - create_vop_ref_mapping_loop (struct loop *loop)
> - {
> -   bitmap refs = memory_accesses.refs_in_loop[loop->num];
> -   struct loop *sloop;
> -   bitmap_iterator bi;
> -   unsigned i;
> -   mem_ref_p ref;
>   
> !   EXECUTE_IF_SET_IN_BITMAP (refs, 0, i, bi)
> !     {
> !       ref = memory_accesses.refs_list[i];
> !       for (sloop = loop; sloop != current_loops->tree_root;
> ! 	   sloop = loop_outer (sloop))
> ! 	if (bitmap_bit_p (ref->stored, loop->num))
> ! 	  {
> ! 	    bitmap refs_stored
> ! 	      = memory_accesses.all_refs_stored_in_loop[sloop->num];
> ! 	    bitmap_set_bit (refs_stored, ref->id);
> ! 	  }
> !     }
>   }
>   
> - /* For each non-clobbered virtual operand and each loop, record the memory
> -    references in this loop that touch the operand.  */
> - 
> - static void
> - create_vop_ref_mapping (void)
> - {
> -   loop_iterator li;
> -   struct loop *loop;
> - 
> -   FOR_EACH_LOOP (li, loop, 0)
> -     {
> -       create_vop_ref_mapping_loop (loop);
> -     }
> - }
>   
>   /* Gathers information about memory accesses in the loops.  */
>   
> --- 1630,1707 ----
>   static void
>   gather_mem_refs_in_loops (void)
>   {
>     struct loop *loop;
>     loop_iterator li;
>   
> !   unsigned *count = XCNEWVEC (unsigned, number_of_loops ());
>   
>     /* Propagate the information about accessed memory references up
>        the loop hierarchy.  */
>     FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
>       {
> !       basic_block *bbs = get_loop_body (loop);
> !       struct loop *outer;
> !       unsigned i;
> ! 
> !       for (i = 0; i < loop->num_nodes; ++i)
> ! 	{
> ! 	  basic_block bb = bbs[i];
> ! 	  gimple_stmt_iterator gsi;
> ! 	  if (bb->loop_father != loop)
> ! 	    continue;
> ! 	  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
> ! 	    gather_mem_refs_stmt (loop, gsi_stmt (gsi));
> ! 	}
> !       free (bbs);
>   
> !       /* Finalize the overall numbers (including subloops) for this loop.  */
> !       count[loop->num]
> ! 	+= bitmap_count_bits (memory_accesses.refs_in_loop[loop->num]);
> ! 
> !       /* If there are too many memory references in this loop and its
> ! 	 sub-loops such that dependence testing would blow up compile-time
> ! 	 drop a unanalyzed store into the list of memory references to
> ! 	 get to the early-out.  */
> !       if ((count[loop->num]
> ! 	   > (unsigned) PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
> ! 	  && bitmap_set_bit
> ! 	       (memory_accesses.all_refs_stored_in_loop[loop->num],
> ! 		UNANALYZABLE_MEM_ID))
> ! 	{
> ! 	  bitmap_set_bit (memory_accesses.refs_in_loop[loop->num],
> ! 			  UNANALYZABLE_MEM_ID);
> ! 	  mark_ref_stored (memory_accesses.refs_list[UNANALYZABLE_MEM_ID],
> ! 			   loop);
> ! 	  if (dump_file && (dump_flags & TDF_DETAILS))
> ! 	    fprintf (dump_file, "Too many memory references in loop %d and its "
> ! 		     "super-loops, giving up\n", loop->num);
> ! 	  warning_at (gimple_location (gsi_stmt (gsi_start_bb (loop->header))),
> ! 		      OPT_Wdisabled_optimization,
> ! 		      "-ftree-loop-im: number of memory references in loop "
> ! 		      "exceeds the --param loops-max-datarefs-for-datadeps "
> ! 		      "threshold");
> ! 	}
> ! 
> !       /* Finalize the overall touched references (including subloops).  */
> !       bitmap_ior_into (memory_accesses.all_refs_in_loop[loop->num],
> ! 		       memory_accesses.refs_in_loop[loop->num]);
> ! 
> !       /* Propagate the information about accessed memory references up
> ! 	 the loop hierarchy.  */
> !       outer = loop_outer (loop);
> !       if (outer == current_loops->tree_root)
>   	continue;
>   
> !       bitmap_ior_into (memory_accesses.all_refs_in_loop[outer->num],
> ! 		       memory_accesses.all_refs_in_loop[loop->num]);
> !       bitmap_ior_into (memory_accesses.all_refs_stored_in_loop[outer->num],
> ! 		       memory_accesses.all_refs_stored_in_loop[loop->num]);
> !       count[outer->num] += count[loop->num];
>       }
>   
> !   free (count);
>   }
>   
>   
>   /* Gathers information about memory accesses in the loops.  */
>   
> *************** analyze_memory_references (void)
> *** 1731,1737 ****
>     memory_accesses.ttae_cache = NULL;
>   
>     gather_mem_refs_in_loops ();
> -   create_vop_ref_mapping ();
>   }
>   
>   /* Returns true if MEM1 and MEM2 may alias.  TTAE_CACHE is used as a cache in
> --- 1734,1739 ----
>
Richard Guenther - March 18, 2013, 8:43 a.m.
On Fri, 15 Mar 2013, Richard Biener wrote:

> On Thu, 14 Mar 2013, Richard Biener wrote:
> 
> > 
> > This extracts pieces from the already posted patch series that are
> > most worthwhile and applicable for backporting to both 4.8 and 4.7.
> > It also re-implements the limiting of the maximum number of memory
> > references to consider for LIMs dependence analysis.  This limiting
> > is now done per loop-nest and disables optimizing outer loops
> > only.  The limiting requires backporting introduction of the
> > shared unalalyzable mem-ref - it works by marking that as stored
> > in loops we do not want to compute dependences for - which makes
> > dependence computation for mems in those loops linear, as that
> > mem-ref, which conveniently has ID 0, is tested first.
> > 
> > Bootstrapped and tested on x86_64-unknown-linux-gnu.
> > 
> > The current limit of 1000 datarefs is quite low (well, for LIMs
> > purposes, that is), and I only bothered to care about -O1 for
> > backports (no caching of the affine combination).  With the
> > limit in place and at -O1 LIM now takes
> > 
> >  tree loop invariant motion:   0.55 ( 1%) usr
> > 
> > for the testcase in PR39326.  Four patches in total, we might
> > consider not backporting the limiting, without it this
> > insane testcase has, at ~2GB memory usage (peak determined by IRA)
> > 
> >  tree loop invariant motion: 533.30 (77%) usr
> > 
> > but avoids running into the DSE / combine issue (and thus stays
> > managable overall at -O1).  With limiting it requires -fno-dse
> > to not blow up (>5GB of memory use).
> 
> Note that the limiting patch (below) causes code-generation differences
> because it collects memory-references in a different order and
> store-motion applies its transform in order of mem-ref IDs
> (different order of loads / stores and different decl UIDs).  The
> different ordering results in quite a big speedup because bitmaps
> have a more regular form (maybe only for this testcase though).

I have now committed the first two patches to trunk as r196768.

Richard.

2013-03-18  Richard Biener  <rguenther@suse.de>

        PR tree-optimization/39326
        * tree-ssa-loop-im.c (refs_independent_p): Exploit symmetry.
        (struct mem_ref): Replace mem member with ao_ref typed member.
        (MEM_ANALYZABLE): Adjust.
        (memref_eq): Likewise.
        (mem_ref_alloc): Likewise.
        (gather_mem_refs_stmt): Likewise.
        (mem_refs_may_alias_p): Use the ao_ref to query the alias oracle.
        (execute_sm_if_changed_flag_set): Adjust.
        (execute_sm): Likewise.
        (ref_always_accessed_p): Likewise.
        (refs_independent_p): Likewise.
        (can_sm_ref_p): Likewise.

Patch

Index: trunk/gcc/tree-ssa-loop-im.c
===================================================================
--- trunk.orig/gcc/tree-ssa-loop-im.c	2013-03-14 12:36:28.881446232 +0100
+++ trunk/gcc/tree-ssa-loop-im.c	2013-03-14 12:36:49.808682841 +0100
@@ -2255,15 +2255,26 @@  ref_always_accessed_p (struct loop *loop
 static bool
 refs_independent_p (mem_ref_p ref1, mem_ref_p ref2)
 {
-  if (ref1 == ref2
-      || bitmap_bit_p (ref1->indep_ref, ref2->id))
+  if (ref1 == ref2)
     return true;
-  if (bitmap_bit_p (ref1->dep_ref, ref2->id))
-    return false;
+
   if (!MEM_ANALYZABLE (ref1)
       || !MEM_ANALYZABLE (ref2))
     return false;
 
+  /* Reference dependence in a loop is symmetric.  */
+  if (ref1->id > ref2->id)
+    {
+      mem_ref_p tem = ref1;
+      ref1 = ref2;
+      ref2 = tem;
+    }
+
+  if (bitmap_bit_p (ref1->indep_ref, ref2->id))
+    return true;
+  if (bitmap_bit_p (ref1->dep_ref, ref2->id))
+    return false;
+
   if (dump_file && (dump_flags & TDF_DETAILS))
     fprintf (dump_file, "Querying dependency of refs %u and %u: ",
 	     ref1->id, ref2->id);
@@ -2272,7 +2283,6 @@  refs_independent_p (mem_ref_p ref1, mem_
 			    &memory_accesses.ttae_cache))
     {
       bitmap_set_bit (ref1->dep_ref, ref2->id);
-      bitmap_set_bit (ref2->dep_ref, ref1->id);
       if (dump_file && (dump_flags & TDF_DETAILS))
 	fprintf (dump_file, "dependent.\n");
       return false;
@@ -2280,7 +2290,6 @@  refs_independent_p (mem_ref_p ref1, mem_
   else
     {
       bitmap_set_bit (ref1->indep_ref, ref2->id);
-      bitmap_set_bit (ref2->indep_ref, ref1->id);
       if (dump_file && (dump_flags & TDF_DETAILS))
 	fprintf (dump_file, "independent.\n");
       return true;