
[2/2] Fix expansion slowness of PR60291

Message ID alpine.LSU.2.11.1402211420420.27942@zhemvz.fhfr.qr
State New

Commit Message

Richard Biener Feb. 21, 2014, 1:21 p.m. UTC
On Fri, 21 Feb 2014, Richard Biener wrote:

> On Fri, 21 Feb 2014, Richard Sandiford wrote:
> 
> > Richard Biener <rguenther@suse.de> writes:
> > > On Fri, 21 Feb 2014, Richard Biener wrote:
> > >
> > >> On Fri, 21 Feb 2014, Richard Biener wrote:
> > >> 
> > >> > On Fri, 21 Feb 2014, Richard Biener wrote:
> > >> > 
> > >> > > 
> > >> > > This fixes the slowness of RTL expansion in PR60291, which is caused
> > >> > > by excessive collisions in mem-attr sharing.  The issue here is
> > >> > > that sharing attempts happen across functions and we have a _lot_
> > >> > > of functions in this testcase referencing the same lexically
> > >> > > equivalent memory, for example MEM[(StgWord *)_5 + -64B].  That
> > >> > > means those get the same hash value.  But they don't compare
> > >> > > equal because an SSA name _5 from function A is of course not equal
> > >> > > to one from function B.
> > >> > > 
> > >> > > The following fixes that by not doing mem-attr sharing across functions
> > >> > > by clearing the mem-attrs hashtable in rest_of_clean_state.
> > >> > > 
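(As a sketch of that first alternative, clearing the table per function: the helper below is hypothetical, since mem_attrs_htab is static to emit-rtl.c, so this only illustrates the idea and is not the patch that was posted.)

/* emit-rtl.c: hypothetical exported helper, needed because
   mem_attrs_htab is static to this file.  */
void
clear_mem_attrs_htab (void)
{
  htab_empty (mem_attrs_htab);
}

/* final.c, in rest_of_clean_state: drop cross-function mem-attr
   sharing once this function's RTL has been discarded.  */
clear_mem_attrs_htab ();
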
> > >> > > Another fix may be to do what the comment in iterative_hash_expr
> > >> > > says for SSA names:
> > >> > > 
> > >> > >     case SSA_NAME:
> > >> > >       /* We can just compare by pointer.  */
> > >> > >       return iterative_hash_host_wide_int (SSA_NAME_VERSION (t), val);
> > >> > > 
> > >> > > (probably blame me for changing that to hashing the SSA version).
> > >> > 
> > >> > It was lxo.
> > >> > 
> > >> > > But I'm not sure that wouldn't uncover issues with various hashtables and
> > >> > > with walking them, generating code that depends on the order.  It's IMHO just not
> > >> > > expected that you compare function-local expressions from different
> > >> > > functions.
> > >> > 
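(To illustrate the collision mechanism outside of GCC, here is a minimal standalone sketch with simplified stand-in types, not GCC code: hashing only the SSA version puts the _5 of every function into the same bucket, while a pointer-based hash keeps distinct names apart.)

#include <stdio.h>
#include <stdint.h>

/* Stand-in for an SSA name: each function has its own object, but the
   version numbering restarts in every function.  */
struct ssa_name { unsigned version; };

/* Hash by version: _5 from function A and _5 from function B get the
   same value, so lexically equal MEMs from different functions collide.  */
static unsigned hash_by_version (const struct ssa_name *n)
{ return n->version * 2654435761u; }

/* Hash by address (shifting away alignment zero bits, as in the first
   alternative patch below): distinct objects hash differently.  */
static unsigned hash_by_pointer (const struct ssa_name *n)
{ return (unsigned) ((uintptr_t) n >> 3) * 2654435761u; }

int main (void)
{
  struct ssa_name fn_a_5 = { 5 };   /* _5 in function A */
  struct ssa_name fn_b_5 = { 5 };   /* _5 in function B */

  printf ("by version: %u vs %u\n",
          hash_by_version (&fn_a_5), hash_by_version (&fn_b_5));
  printf ("by pointer: %u vs %u\n",
          hash_by_pointer (&fn_a_5), hash_by_pointer (&fn_b_5));
  return 0;
}
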
> > >> > Same speedup result from
> > >> > 
> > >> > Index: gcc/tree.c
> > >> > ===================================================================
> > >> > --- gcc/tree.c  (revision 207960)
> > >> > +++ gcc/tree.c  (working copy)
> > >> > @@ -7428,7 +7428,7 @@ iterative_hash_expr (const_tree t, hashv
> > >> >        }
> > >> >      case SSA_NAME:
> > >> >        /* We can just compare by pointer.  */
> > >> > -      return iterative_hash_host_wide_int (SSA_NAME_VERSION (t), val);
> > >> > +      return iterative_hash_hashval_t ((uintptr_t)t>>3, val);
> > >> >      case PLACEHOLDER_EXPR:
> > >> >        /* The node itself doesn't matter.  */
> > >> >        return val;
> > >> > 
> > >> > and from
> > >> > 
> > >> > Index: gcc/tree.c
> > >> > ===================================================================
> > >> > --- gcc/tree.c  (revision 207960)
> > >> > +++ gcc/tree.c  (working copy)
> > >> > @@ -7428,7 +7428,9 @@ iterative_hash_expr (const_tree t, hashv
> > >> >        }
> > >> >      case SSA_NAME:
> > >> >        /* We can just compare by pointer.  */
> > >> > -      return iterative_hash_host_wide_int (SSA_NAME_VERSION (t), val);
> > >> > +      return iterative_hash_host_wide_int
> > >> > +             (DECL_UID (cfun->decl),
> > >> > +              iterative_hash_host_wide_int (SSA_NAME_VERSION (t), val));
> > >> >      case PLACEHOLDER_EXPR:
> > >> >        /* The node itself doesn't matter.  */
> > >> >        return val;
> > >> > 
> > >> > better than hashing pointers but requiring cfun != NULL in this
> > >> > function isn't good either.
> > >> > 
> > >> > At this point I'm more comfortable with clearing the hashtable
> > >> > than with changing iterative_hash_expr in any way.  It's also a
> > >> > step along the way to getting rid of the hash completely.
> > >> > 
> > >> > Oh, btw, the speedup is going from
> > >> > 
> > >> >  expand                  : 481.98 (94%) usr   1.15 (17%) sys 481.94 (93%) 
> > >> > wall  293891 kB (15%) ggc
> > >> > 
> > >> > to
> > >> > 
> > >> >  expand                  :   2.66 ( 7%) usr   0.13 ( 2%) sys   2.64 ( 6%) 
> > >> > wall  262544 kB (13%) ggc
> > >> > 
> > >> > at -O0 (less dramatic slowness for -On).
> > >> > 
> > >> > > The other thing would be to discard mem-attr sharing altogether,
> > >> > > but that doesn't seem appropriate at this stage (but it would
> > >> > > also simplify quite a bit of code).  With only one function in RTL
> > >> > > at a time that shouldn't be too bad (see several suggestions
> > >> > > along that line, even with statistics).
> > >> 
> > >> Last statistics: http://gcc.gnu.org/ml/gcc-patches/2011-08/msg01784.html
> > >
> > > With the patch below to get some statistics, we see that one important
> > > piece of sharing not covered by the above measurements is RTX copying(?).
> > >
> > > On the testcase for this PR I get at -O1 and without the patch
> > > to clear the hashtable after each function
> > >
> > > 142489 mem_attrs created (142439 for new, 50 for modification)
> > > 1983225 mem_attrs shared (4044 for new, 820241 for modification, 1158940 
> > > by rtx copying)
> > > 0 mem_attrs dropped
> > >
> > > and with the patch to clear after each function
> > >
> > > 364411 mem_attrs created (144478 for new, 219933 for modification)
> > > 1761303 mem_attrs shared (2005 for new, 600358 for modification, 1158940 
> > > by rtx copying)
> > > 0 mem_attrs dropped
> > >
> > > while for dwarf2out.c I see without the clearing
> > >
> > > 24399 mem_attrs created (6929 for new, 17470 for modification)
> > > 102676 mem_attrs shared (10878 for new, 29265 for modification, 62533 by 
> > > rtx copying)
> > > 16 mem_attrs dropped
> > >
> > > which means that completely dropping the sharing would result
> > > in creating 6929 + 17807 + 62533(!) vs. 24399 mem-attrs.
> > > That's still not a lot of overhead given that mem-attrs take 40 bytes
> > > (3MB vs. 950kB).  There is also always the possibility of
> > > explicitly ref-counting mem-attrs to handle sharing by rtx
> > > copying (at least cse, fwprop, combine, ira and reload copy MEMs,
> > > probably some for no good reason because MEMs are not shared),
> > > thus making mem-attrs unshare-on-modify.
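
(A minimal sketch of that last idea, reference-counted unshare-on-modify attributes, using simplified stand-in types rather than GCC's mem_attrs:)

#include <stdlib.h>
#include <string.h>

/* Simplified stand-in for mem_attrs, extended with a reference count.  */
struct attrs { unsigned refcount; unsigned align; /* offset, size, expr, ... */ };

/* Copying a MEM bumps the count instead of looking up a shared copy.  */
static struct attrs *attrs_ref (struct attrs *a)
{ if (a) a->refcount++; return a; }

static void attrs_unref (struct attrs *a)
{ if (a && --a->refcount == 0) free (a); }

/* Unshare-on-modify: clone only if someone else still references it.  */
static struct attrs *attrs_for_modify (struct attrs *a)
{
  if (a->refcount == 1)
    return a;                        /* sole owner, modify in place */
  struct attrs *copy = malloc (sizeof *copy);
  memcpy (copy, a, sizeof *copy);
  copy->refcount = 1;
  a->refcount--;
  return copy;
}

int main (void)
{
  struct attrs *a = malloc (sizeof *a);
  a->refcount = 1;
  a->align = 8;

  struct attrs *b = attrs_ref (a);   /* a MEM copy shares the attrs */
  b = attrs_for_modify (b);          /* the copy wants a different align */
  b->align = 16;                     /* a still sees align == 8 */

  attrs_unref (a);
  attrs_unref (b);
  return 0;
}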
> > 
> > In a thread a few years ago you talked about the possibility of going
> > further and folding the attributes into the MEM itself, so avoiding
> > the indirection and separate allocation:
> > 
> >   http://thread.gmane.org/gmane.comp.gcc.patches/244464/focus=244538
> > 
> > (and earlier posts in the thread).  Would that still be OK?
> > I might have a go if so.
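
(Schematically, "folding the attributes into the MEM itself" trades the pointer and the separate allocation for a by-value copy inside the MEM; the stand-in types below are only illustrative, GCC's real rtx layout differs.)

/* Simplified stand-in; the real mem_attrs has alias set, align, offset,
   size, address space and expr fields.  */
struct mem_attrs_stub { int alias; unsigned align; };

/* Today: every MEM points at a separately GC-allocated, possibly shared
   mem_attrs object (one extra allocation plus an indirection).  */
struct mem_rtx_today {
  void *address;
  struct mem_attrs_stub *attrs;
};

/* Folded in: the attributes live inside the MEM itself, so there is no
   separate allocation and no sharing to maintain; copy_rtx would simply
   copy them along with the rest of the MEM.  */
struct mem_rtx_folded {
  void *address;
  struct mem_attrs_stub attrs;
};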
> 
> It would work for me.  Micha just brought up the easiest incremental
> change though, which is
> 
> Index: gcc/emit-rtl.c
> ===================================================================
> --- gcc/emit-rtl.c      (revision 207960)
> +++ gcc/emit-rtl.c      (working copy)
> @@ -304,14 +304,12 @@ set_mem_attrs (rtx mem, mem_attrs *attrs
>        return;
>      }
>  
> -  slot = htab_find_slot (mem_attrs_htab, attrs, INSERT);
> -  if (*slot == 0)
> +  if (!MEM_ATTRS (mem)
> +      || memcmp (MEM_ATTRS (mem), attrs, sizeof (mem_attrs)) != 0)
>      {
> -      *slot = ggc_alloc_mem_attrs ();
> -      memcpy (*slot, attrs, sizeof (mem_attrs));
> +      MEM_ATTRS (mem) = ggc_alloc_mem_attrs ();
> +      memcpy (MEM_ATTRS (mem), attrs, sizeof (mem_attrs));
>      }
> -
> -  MEM_ATTRS (mem) = (mem_attrs *) *slot;
>  }
>  
>  /* Returns a hash code for X (which is a really a reg_attrs *).  */
> 
> which would be equivalent to folding it into the MEM, except that it
> 1) does not save the pointer, and 2) retains the important copy_rtx sharing.

I am testing the following (and also consider it appropriate as a
fix for the regression PR60291).

Ok for trunk/branch(es)?  Now we have many variants to choose from ;)

Thanks,
Richard.

2014-02-21  Richard Biener  <rguenther@suse.de>

	PR middle-end/60291
	* emit-rtl.c (mem_attrs_htab): Remove.
	(mem_attrs_htab_hash): Likewise.
	(mem_attrs_htab_eq): Likewise.
	(set_mem_attrs): Always allocate new mem-attrs when something
	changed.
	(init_emit_once): Do not allocate mem_attrs_htab.

Comments

Richard Biener Feb. 21, 2014, 2:20 p.m. UTC | #1
On Fri, 21 Feb 2014, Richard Biener wrote:

> On Fri, 21 Feb 2014, Richard Biener wrote:
> 
> > On Fri, 21 Feb 2014, Richard Sandiford wrote:
> > 
> > > In a thread a few years ago you talked about the possibility of going
> > > further and folding the attributes into the MEM itself, so avoiding
> > > the indirection and separate allocation:
> > > 
> > >   http://thread.gmane.org/gmane.comp.gcc.patches/244464/focus=244538
> > > 
> > > (and earlier posts in the thread).  Would that still be OK?
> > > I might have a go if so.
> > 
> > It would work for me.  Micha just brought up the easiest incremental
> > change though, which is

...

> I am testing the following (and also consider it appropriate as a
> fix for the regression PR60291).
> 
> Ok for trunk/branch(es)?  Now we have many variants to choose from ;)

Jakub requested bootstrap statistics for this one.  I get,
for r207939 and a --enable-languages=c x86_64 bootstrap,
3609924 mem-attrs created overall without the patch and
8268976 with the patch (that's a factor of 2.3 and thus "nothing").

Richard.
Richard Biener Feb. 24, 2014, 5:28 p.m. UTC | #2
On Fri, 21 Feb 2014, Richard Biener wrote:

> On Fri, 21 Feb 2014, Richard Biener wrote:
> 
> > On Fri, 21 Feb 2014, Richard Biener wrote:
> > 
> > > On Fri, 21 Feb 2014, Richard Sandiford wrote:
> > > 
> > > > In a thread a few years ago you talked about the possibility of going
> > > > further and folding the attributes into the MEM itself, so avoiding
> > > > the indirection and separate allocation:
> > > > 
> > > >   http://thread.gmane.org/gmane.comp.gcc.patches/244464/focus=244538
> > > > 
> > > > (and earlier posts in the thread).  Would that still be OK?
> > > > I might have a go if so.
> > > 
> > > It would work for me.  Micha just brought up the easiest incremental
> > > change though, which is
> 
> ...
> 
> > I am testing the following (and also consider it appropriate as a
> > fix for the regression PR60291).
> > 
> > Ok for trunk/branch(es)?  Now we have many variants to choose from ;)
> 
> Jakub requested statistics for a bootstrap for this one.  I get
> for r207939 and a --enable-languages=c x86_64 bootstrap
> 3609924 mem-attrs created overall without the patch and
> 8268976 with the patch (that's a factor of 2.3 and thus "nothing").

One more piece of that statistic: the peak number of mem-attrs is
121985 without and 298399 with the patch (that's the overall number
of GC allocations of a mem-attr for the worst compilation unit
during bootstrap).  That's 4.7MB vs. 11.6MB _aggregate_ memory
use (not translatable to the peak number of mem-attrs live, but
it certainly bounds it).

As it seems that nobody is objecting to this patch, I'll go ahead
and check it in tomorrow.

Thanks,
Richard.

Patch

Index: gcc/emit-rtl.c
===================================================================
--- gcc/emit-rtl.c	(revision 207960)
+++ gcc/emit-rtl.c	(working copy)
@@ -126,10 +126,6 @@  rtx cc0_rtx;
 static GTY ((if_marked ("ggc_marked_p"), param_is (struct rtx_def)))
      htab_t const_int_htab;
 
-/* A hash table storing memory attribute structures.  */
-static GTY ((if_marked ("ggc_marked_p"), param_is (struct mem_attrs)))
-     htab_t mem_attrs_htab;
-
 /* A hash table storing register attribute structures.  */
 static GTY ((if_marked ("ggc_marked_p"), param_is (struct reg_attrs)))
      htab_t reg_attrs_htab;
@@ -157,8 +153,6 @@  static rtx lookup_const_double (rtx);
 static hashval_t const_fixed_htab_hash (const void *);
 static int const_fixed_htab_eq (const void *, const void *);
 static rtx lookup_const_fixed (rtx);
-static hashval_t mem_attrs_htab_hash (const void *);
-static int mem_attrs_htab_eq (const void *, const void *);
 static hashval_t reg_attrs_htab_hash (const void *);
 static int reg_attrs_htab_eq (const void *, const void *);
 static reg_attrs *get_reg_attrs (tree, int);
@@ -249,20 +243,6 @@  const_fixed_htab_eq (const void *x, cons
   return fixed_identical (CONST_FIXED_VALUE (a), CONST_FIXED_VALUE (b));
 }
 
-/* Returns a hash code for X (which is a really a mem_attrs *).  */
-
-static hashval_t
-mem_attrs_htab_hash (const void *x)
-{
-  const mem_attrs *const p = (const mem_attrs *) x;
-
-  return (p->alias ^ (p->align * 1000)
-	  ^ (p->addrspace * 4000)
-	  ^ ((p->offset_known_p ? p->offset : 0) * 50000)
-	  ^ ((p->size_known_p ? p->size : 0) * 2500000)
-	  ^ (size_t) iterative_hash_expr (p->expr, 0));
-}
-
 /* Return true if the given memory attributes are equal.  */
 
 static bool
@@ -280,23 +260,11 @@  mem_attrs_eq_p (const struct mem_attrs *
 		  && operand_equal_p (p->expr, q->expr, 0))));
 }
 
-/* Returns nonzero if the value represented by X (which is really a
-   mem_attrs *) is the same as that given by Y (which is also really a
-   mem_attrs *).  */
-
-static int
-mem_attrs_htab_eq (const void *x, const void *y)
-{
-  return mem_attrs_eq_p ((const mem_attrs *) x, (const mem_attrs *) y);
-}
-
 /* Set MEM's memory attributes so that they are the same as ATTRS.  */
 
 static void
 set_mem_attrs (rtx mem, mem_attrs *attrs)
 {
-  void **slot;
-
   /* If everything is the default, we can just clear the attributes.  */
   if (mem_attrs_eq_p (attrs, mode_mem_attrs[(int) GET_MODE (mem)]))
     {
@@ -304,14 +272,12 @@  set_mem_attrs (rtx mem, mem_attrs *attrs
       return;
     }
 
-  slot = htab_find_slot (mem_attrs_htab, attrs, INSERT);
-  if (*slot == 0)
+  if (!MEM_ATTRS (mem)
+      || !mem_attrs_eq_p (attrs, MEM_ATTRS (mem)))
     {
-      *slot = ggc_alloc_mem_attrs ();
-      memcpy (*slot, attrs, sizeof (mem_attrs));
+      MEM_ATTRS (mem) = ggc_alloc_mem_attrs ();
+      memcpy (MEM_ATTRS (mem), attrs, sizeof (mem_attrs));
     }
-
-  MEM_ATTRS (mem) = (mem_attrs *) *slot;
 }
 
 /* Returns a hash code for X (which is a really a reg_attrs *).  */
@@ -5662,8 +5628,6 @@  init_emit_once (void)
   const_fixed_htab = htab_create_ggc (37, const_fixed_htab_hash,
 				      const_fixed_htab_eq, NULL);
 
-  mem_attrs_htab = htab_create_ggc (37, mem_attrs_htab_hash,
-				    mem_attrs_htab_eq, NULL);
   reg_attrs_htab = htab_create_ggc (37, reg_attrs_htab_hash,
 				    reg_attrs_htab_eq, NULL);