Patchwork Free decl_in_states when not needed

login
register
mail settings
Submitter Jan Hubicka
Date Aug. 28, 2013, 10:47 a.m.
Message ID <20130828104752.GD8260@kam.mff.cuni.cz>
Download mbox | patch
Permalink /patch/270447/
State New
Headers show

Comments

Jan Hubicka - Aug. 28, 2013, 10:47 a.m.
Hi,
this patch saves almost 1Gb of GGC reachable memory for firefox. 
Currently we point to all streamed in trees from global decl in state and from
function specific states.  The states are big and they dangle pointers.
This patch gets rid of states we do not need.

We run GGC collect just once during usual WPA compilation.  Currently
it is just after tree loading that introduce very little garbage.  This
patch moves it after unreachable code removal when freeing happen.

Still we do not suceed in much of reuse in between GGC and heap, so peak memory
use in TOP reduces by only 300MB.  It may be interesting to allow random access
into per-function decl in states and read them only after decl merging (and
possibly unreachable function removal).

Bootstrapped/regtested x86_64-linux, OK?

	* lto-symtab.c (lto_cgraph_replace_node): Free decl_in_state.
	* cgraph.c (cgraph_release_function_body): Free decl_in_state.
	* lto-section-in.c (lto_free_function_in_decl_state): New function.
	(lto_free_function_in_decl_state_for_node): New function.

	* lto.c (read_cgraph_and_symbols): Remove ggc_collect;
	clear section node; add comment why we do not collect.
Richard Guenther - Aug. 28, 2013, 11:08 a.m.
On Wed, 28 Aug 2013, Jan Hubicka wrote:

> Hi,
> this patch saves almost 1Gb of GGC reachable memory for firefox. 
> Currently we point to all streamed in trees from global decl in state and from
> function specific states.  The states are big and they dangle pointers.
> This patch gets rid of states we do not need.
> 
> We run GGC collect just once during usual WPA compilation.  Currently
> it is just after tree loading that introduce very little garbage.  This
> patch moves it after unreachable code removal when freeing happen.
> 
> Still we do not suceed in much of reuse in between GGC and heap, so peak memory
> use in TOP reduces by only 300MB.  It may be interesting to allow random access
> into per-function decl in states and read them only after decl merging (and
> possibly unreachable function removal).
> 
> Bootstrapped/regtested x86_64-linux, OK?
> 
> 	* lto-symtab.c (lto_cgraph_replace_node): Free decl_in_state.
> 	* cgraph.c (cgraph_release_function_body): Free decl_in_state.
> 	* lto-section-in.c (lto_free_function_in_decl_state): New function.
> 	(lto_free_function_in_decl_state_for_node): New function.
> 
> 	* lto.c (read_cgraph_and_symbols): Remove ggc_collect;
> 	clear section node; add comment why we do not collect.
> Index: lto-symtab.c
> ===================================================================
> --- lto-symtab.c	(revision 202047)
> +++ lto-symtab.c	(working copy)
> @@ -80,6 +80,8 @@ lto_cgraph_replace_node (struct cgraph_n
>    /* Redirect incomming references.  */
>    ipa_clone_referring ((symtab_node)prevailing_node, &node->symbol.ref_list);
>  
> +  lto_free_function_in_decl_state_for_node ((symtab_node)node);
> +
>    if (node->symbol.decl != prevailing_node->symbol.decl)
>      cgraph_release_function_body (node);
>  
> Index: cgraph.c
> ===================================================================
> --- cgraph.c	(revision 202047)
> +++ cgraph.c	(working copy)
> @@ -1663,6 +1663,8 @@ cgraph_release_function_body (struct cgr
>    if (!node->used_as_abstract_origin && DECL_INITIAL (node->symbol.decl))
>      DECL_INITIAL (node->symbol.decl) = error_mark_node;
>    release_function_body (node->symbol.decl);
> +  if (node->symbol.lto_file_data)
> +    lto_free_function_in_decl_state_for_node ((symtab_node) node);
>  }
>  
>  /* Remove the node from cgraph.  */
> Index: lto-section-in.c
> ===================================================================
> --- lto-section-in.c	(revision 202047)
> +++ lto-section-in.c	(working copy)
> @@ -414,6 +414,41 @@ lto_get_function_in_decl_state (struct l
>    return slot? ((struct lto_in_decl_state*) *slot) : NULL;
>  }
>  
> +/* Free decl_states.  */
> +
> +void
> +lto_free_function_in_decl_state (struct lto_in_decl_state *state)
> +{
> +  int i;
> +  for (i = 0; i < LTO_N_DECL_STREAMS; i++)
> +    ggc_free (state->streams[i].trees);

We likely also waste a lot of memory by means of GC overhead here.
Moving this array out of GC space would be better (we know the
exact size upfront) - should be doable with making lto_tree_ref_able
GTY((user)) and a custom marker.

> +  ggc_free (state);
> +}
> +
> +/* Free decl_states associated with NODE.  This makes it possible to furhter
> +   release trees needed by the NODE's body.  */
> +
> +void
> +lto_free_function_in_decl_state_for_node (symtab_node node)
> +{
> +  struct lto_in_decl_state temp;
> +  void **slot;
> +
> +  if (!node->symbol.lto_file_data)
> +    return;
> +
> +  temp.fn_decl = node->symbol.decl;
> +  slot = htab_find_slot (node->symbol.lto_file_data->function_decl_states,
> +			 &temp, NO_INSERT);
> +  if (slot && *slot)
> +    {
> +      lto_free_function_in_decl_state ((struct lto_in_decl_state*) *slot);
> +      htab_clear_slot (node->symbol.lto_file_data->function_decl_states,
> +		       slot);
> +    }
> +  node->symbol.lto_file_data = NULL;
> +}
> +
>  
>  /* Report read pass end of the section.  */
>  
> Index: ipa.c
> ===================================================================
> --- ipa.c	(revision 202047)
> +++ ipa.c	(working copy)
> @@ -873,6 +873,17 @@ function_and_variable_visibility (bool w
>  	       segfault though. */
>  	    symtab_dissolve_same_comdat_group_list ((symtab_node) node);
>  	}
> +      if (node->symbol.externally_visible
> +          && DECL_COMDAT (node->symbol.decl)
> +	  && comdat_can_be_unshared_p ((symtab_node) node))
> +	{
> +	  if (dump_file
> +	      && DECL_VISIBILITY (node->symbol.decl) != VISIBILITY_HIDDEN)
> +	    fprintf (dump_file, "Promoting visibility to hidden: %s/%i\n",
> +		     cgraph_node_name (node), node->symbol.order);
> +	  DECL_VISIBILITY (node->symbol.decl) = VISIBILITY_HIDDEN;
> +	  DECL_VISIBILITY_SPECIFIED (node->symbol.decl) = true;
> +	}
>  
>        if (node->thunk.thunk_p
>  	  && TREE_PUBLIC (node->symbol.decl))
> @@ -980,6 +991,17 @@ function_and_variable_visibility (bool w
>  	    symtab_dissolve_same_comdat_group_list ((symtab_node) vnode);
>  	  vnode->symbol.resolution = LDPR_PREVAILING_DEF_IRONLY;
>  	}
> +      if (vnode->symbol.externally_visible
> +          && DECL_COMDAT (vnode->symbol.decl)
> +	  && comdat_can_be_unshared_p ((symtab_node) vnode))
> +	{
> +	  if (dump_file
> +	      && DECL_VISIBILITY (vnode->symbol.decl) == VISIBILITY_HIDDEN)
> +	    fprintf (dump_file, "Promoting visibility to hidden: %s/%i\n",
> +		     varpool_node_name (vnode), vnode->symbol.order);
> +	  DECL_VISIBILITY (vnode->symbol.decl) = VISIBILITY_HIDDEN;
> +	  DECL_VISIBILITY_SPECIFIED (vnode->symbol.decl) = true;
> +	}
>      }
>  
>    if (dump_file)

Stray changes?

> Index: lto/lto.c
> ===================================================================
> --- lto/lto.c	(revision 202047)
> +++ lto/lto.c	(working copy)
> @@ -3429,7 +3429,6 @@ read_cgraph_and_symbols (unsigned nfiles
>    tree_scc_hash.dispose ();
>    obstack_free (&tree_scc_hash_obstack, NULL);
>    free_gimple_type_tables ();
> -  ggc_collect ();
>
>    /* Set the hooks so that all of the ipa passes can read in their data.  */
>    lto_set_in_hooks (all_file_decl_data, get_section_data, free_section_data);
> @@ -3484,7 +3483,6 @@ read_cgraph_and_symbols (unsigned nfiles
>      }
>    htab_delete (tree_with_vars);
>    tree_with_vars = NULL;
> -  ggc_collect ();

Just leave the ggc_collect ()s there, they are free.
  
>    timevar_pop (TV_IPA_LTO_DECL_MERGE);
>    /* Each pass will set the appropriate timer.  */
> @@ -3503,6 +3501,9 @@ read_cgraph_and_symbols (unsigned nfiles
>        gcc_assert (all_file_decl_data[i]->symtab_node_encoder);
>        lto_symtab_encoder_delete (all_file_decl_data[i]->symtab_node_encoder);
>        all_file_decl_data[i]->symtab_node_encoder = NULL;
> +      lto_free_function_in_decl_state (all_file_decl_data[i]->global_decl_state);
> +      all_file_decl_data[i]->global_decl_state = NULL;
> +      all_file_decl_data[i]->current_decl_state = NULL; 
>      }
>  
>    /* Finally merge the cgraph according to the decl merging decisions.  */
> @@ -3513,7 +3514,12 @@ read_cgraph_and_symbols (unsigned nfiles
>        dump_symtab (cgraph_dump_file);
>      }
>    lto_symtab_merge_symbols ();
> -  ggc_collect ();
> +
> +  /* Do not GGC collect here;  streaming in should not produce garbage.
> +     Be sure we first collect after merging symbols, setting up visibilities
> +     and removing unreachable nodes.  This will happen after whole program
> +     visibility pass.  This should release more memory back to the system
> +     and possibly allow us to re-use it for heap.  */

But it's not wrong to collect here, no?  See above, it should be
mostly free.

Thanks,
Richard.
Jan Hubicka - Aug. 28, 2013, 1:51 p.m.
> > Index: lto-section-in.c
> > ===================================================================
> > --- lto-section-in.c	(revision 202047)
> > +++ lto-section-in.c	(working copy)
> > @@ -414,6 +414,41 @@ lto_get_function_in_decl_state (struct l
> >    return slot? ((struct lto_in_decl_state*) *slot) : NULL;
> >  }
> >  
> > +/* Free decl_states.  */
> > +
> > +void
> > +lto_free_function_in_decl_state (struct lto_in_decl_state *state)
> > +{
> > +  int i;
> > +  for (i = 0; i < LTO_N_DECL_STREAMS; i++)
> > +    ggc_free (state->streams[i].trees);
> 
> We likely also waste a lot of memory by means of GC overhead here.
> Moving this array out of GC space would be better (we know the
> exact size upfront) - should be doable with making lto_tree_ref_able
> GTY((user)) and a custom marker.

Possibly, yes. It should be way how to get this memory back for malloc stuff
without convincing ggc to do something useful ;)

I will be happy to play with this incrementally. (It bit makes me feel bit
worried about maintenance costs when user markes starts to do smart things
everywhere, but this one is important enough special case for sure).
> 
> Stray changes?

Yes, sorry, I was running both tests at one machine.
Removed from the patch now.
> 
> Just leave the ggc_collect ()s there, they are free.

First ggc_collect after type reading executes garbage collector and the other
ggc_collects do nothing (since we never allocate enough of ggc memory to
trigger them).  Currently we trigger ggc_collect just after tree streaming. It
spend time to walk memory, free almost nothing, produce debug output so I know
how much memory we need, and mark the bounds high enough so it ggc_collect
won't happen again. All in all it is pointless for overall memory use of
WPA.

I want the first run to happen only after things are merged & dead code
removed.  Then it frees about 0.5-1GB of garbage that is result of decl state
streams disappearing and some trees actually being rendered dead. This way ggc
allocations of IPA optimizers can use the memory freed.

I can drop this change from the patch removing states now and we can discuss
these incrementally.

Honza
>   
> >    timevar_pop (TV_IPA_LTO_DECL_MERGE);
> >    /* Each pass will set the appropriate timer.  */
> > @@ -3503,6 +3501,9 @@ read_cgraph_and_symbols (unsigned nfiles
> >        gcc_assert (all_file_decl_data[i]->symtab_node_encoder);
> >        lto_symtab_encoder_delete (all_file_decl_data[i]->symtab_node_encoder);
> >        all_file_decl_data[i]->symtab_node_encoder = NULL;
> > +      lto_free_function_in_decl_state (all_file_decl_data[i]->global_decl_state);
> > +      all_file_decl_data[i]->global_decl_state = NULL;
> > +      all_file_decl_data[i]->current_decl_state = NULL; 
> >      }
> >  
> >    /* Finally merge the cgraph according to the decl merging decisions.  */
> > @@ -3513,7 +3514,12 @@ read_cgraph_and_symbols (unsigned nfiles
> >        dump_symtab (cgraph_dump_file);
> >      }
> >    lto_symtab_merge_symbols ();
> > -  ggc_collect ();
> > +
> > +  /* Do not GGC collect here;  streaming in should not produce garbage.
> > +     Be sure we first collect after merging symbols, setting up visibilities
> > +     and removing unreachable nodes.  This will happen after whole program
> > +     visibility pass.  This should release more memory back to the system
> > +     and possibly allow us to re-use it for heap.  */
> 
> But it's not wrong to collect here, no?  See above, it should be
> mostly free.
> 
> Thanks,
> Richard.

Patch

Index: lto-symtab.c
===================================================================
--- lto-symtab.c	(revision 202047)
+++ lto-symtab.c	(working copy)
@@ -80,6 +80,8 @@  lto_cgraph_replace_node (struct cgraph_n
   /* Redirect incomming references.  */
   ipa_clone_referring ((symtab_node)prevailing_node, &node->symbol.ref_list);
 
+  lto_free_function_in_decl_state_for_node ((symtab_node)node);
+
   if (node->symbol.decl != prevailing_node->symbol.decl)
     cgraph_release_function_body (node);
 
Index: cgraph.c
===================================================================
--- cgraph.c	(revision 202047)
+++ cgraph.c	(working copy)
@@ -1663,6 +1663,8 @@  cgraph_release_function_body (struct cgr
   if (!node->used_as_abstract_origin && DECL_INITIAL (node->symbol.decl))
     DECL_INITIAL (node->symbol.decl) = error_mark_node;
   release_function_body (node->symbol.decl);
+  if (node->symbol.lto_file_data)
+    lto_free_function_in_decl_state_for_node ((symtab_node) node);
 }
 
 /* Remove the node from cgraph.  */
Index: lto-section-in.c
===================================================================
--- lto-section-in.c	(revision 202047)
+++ lto-section-in.c	(working copy)
@@ -414,6 +414,41 @@  lto_get_function_in_decl_state (struct l
   return slot? ((struct lto_in_decl_state*) *slot) : NULL;
 }
 
+/* Free decl_states.  */
+
+void
+lto_free_function_in_decl_state (struct lto_in_decl_state *state)
+{
+  int i;
+  for (i = 0; i < LTO_N_DECL_STREAMS; i++)
+    ggc_free (state->streams[i].trees);
+  ggc_free (state);
+}
+
+/* Free decl_states associated with NODE.  This makes it possible to furhter
+   release trees needed by the NODE's body.  */
+
+void
+lto_free_function_in_decl_state_for_node (symtab_node node)
+{
+  struct lto_in_decl_state temp;
+  void **slot;
+
+  if (!node->symbol.lto_file_data)
+    return;
+
+  temp.fn_decl = node->symbol.decl;
+  slot = htab_find_slot (node->symbol.lto_file_data->function_decl_states,
+			 &temp, NO_INSERT);
+  if (slot && *slot)
+    {
+      lto_free_function_in_decl_state ((struct lto_in_decl_state*) *slot);
+      htab_clear_slot (node->symbol.lto_file_data->function_decl_states,
+		       slot);
+    }
+  node->symbol.lto_file_data = NULL;
+}
+
 
 /* Report read pass end of the section.  */
 
Index: ipa.c
===================================================================
--- ipa.c	(revision 202047)
+++ ipa.c	(working copy)
@@ -873,6 +873,17 @@  function_and_variable_visibility (bool w
 	       segfault though. */
 	    symtab_dissolve_same_comdat_group_list ((symtab_node) node);
 	}
+      if (node->symbol.externally_visible
+          && DECL_COMDAT (node->symbol.decl)
+	  && comdat_can_be_unshared_p ((symtab_node) node))
+	{
+	  if (dump_file
+	      && DECL_VISIBILITY (node->symbol.decl) != VISIBILITY_HIDDEN)
+	    fprintf (dump_file, "Promoting visibility to hidden: %s/%i\n",
+		     cgraph_node_name (node), node->symbol.order);
+	  DECL_VISIBILITY (node->symbol.decl) = VISIBILITY_HIDDEN;
+	  DECL_VISIBILITY_SPECIFIED (node->symbol.decl) = true;
+	}
 
       if (node->thunk.thunk_p
 	  && TREE_PUBLIC (node->symbol.decl))
@@ -980,6 +991,17 @@  function_and_variable_visibility (bool w
 	    symtab_dissolve_same_comdat_group_list ((symtab_node) vnode);
 	  vnode->symbol.resolution = LDPR_PREVAILING_DEF_IRONLY;
 	}
+      if (vnode->symbol.externally_visible
+          && DECL_COMDAT (vnode->symbol.decl)
+	  && comdat_can_be_unshared_p ((symtab_node) vnode))
+	{
+	  if (dump_file
+	      && DECL_VISIBILITY (vnode->symbol.decl) == VISIBILITY_HIDDEN)
+	    fprintf (dump_file, "Promoting visibility to hidden: %s/%i\n",
+		     varpool_node_name (vnode), vnode->symbol.order);
+	  DECL_VISIBILITY (vnode->symbol.decl) = VISIBILITY_HIDDEN;
+	  DECL_VISIBILITY_SPECIFIED (vnode->symbol.decl) = true;
+	}
     }
 
   if (dump_file)
Index: lto/lto.c
===================================================================
--- lto/lto.c	(revision 202047)
+++ lto/lto.c	(working copy)
@@ -3429,7 +3429,6 @@  read_cgraph_and_symbols (unsigned nfiles
   tree_scc_hash.dispose ();
   obstack_free (&tree_scc_hash_obstack, NULL);
   free_gimple_type_tables ();
-  ggc_collect ();
 
   /* Set the hooks so that all of the ipa passes can read in their data.  */
   lto_set_in_hooks (all_file_decl_data, get_section_data, free_section_data);
@@ -3484,7 +3483,6 @@  read_cgraph_and_symbols (unsigned nfiles
     }
   htab_delete (tree_with_vars);
   tree_with_vars = NULL;
-  ggc_collect ();
 
   timevar_pop (TV_IPA_LTO_DECL_MERGE);
   /* Each pass will set the appropriate timer.  */
@@ -3503,6 +3501,9 @@  read_cgraph_and_symbols (unsigned nfiles
       gcc_assert (all_file_decl_data[i]->symtab_node_encoder);
       lto_symtab_encoder_delete (all_file_decl_data[i]->symtab_node_encoder);
       all_file_decl_data[i]->symtab_node_encoder = NULL;
+      lto_free_function_in_decl_state (all_file_decl_data[i]->global_decl_state);
+      all_file_decl_data[i]->global_decl_state = NULL;
+      all_file_decl_data[i]->current_decl_state = NULL; 
     }
 
   /* Finally merge the cgraph according to the decl merging decisions.  */
@@ -3513,7 +3514,12 @@  read_cgraph_and_symbols (unsigned nfiles
       dump_symtab (cgraph_dump_file);
     }
   lto_symtab_merge_symbols ();
-  ggc_collect ();
+
+  /* Do not GGC collect here;  streaming in should not produce garbage.
+     Be sure we first collect after merging symbols, setting up visibilities
+     and removing unreachable nodes.  This will happen after whole program
+     visibility pass.  This should release more memory back to the system
+     and possibly allow us to re-use it for heap.  */
   cgraph_state = CGRAPH_STATE_IPA_SSA;
 
   timevar_pop (TV_IPA_LTO_CGRAPH_MERGE);
Index: lto-streamer.h
===================================================================
--- lto-streamer.h	(revision 202047)
+++ lto-streamer.h	(working copy)
@@ -774,6 +774,8 @@  extern hashval_t lto_hash_in_decl_state
 extern int lto_eq_in_decl_state (const void *, const void *);
 extern struct lto_in_decl_state *lto_get_function_in_decl_state (
 				      struct lto_file_decl_data *, tree);
+extern void lto_free_function_in_decl_state (struct lto_in_decl_state *);
+extern void lto_free_function_in_decl_state_for_node (symtab_node);
 extern void lto_section_overrun (struct lto_input_block *) ATTRIBUTE_NORETURN;
 extern void lto_value_range_error (const char *,
 				   HOST_WIDE_INT, HOST_WIDE_INT,