diff mbox

[var-tracking] small speed-ups

Message ID alpine.LNX.2.02.1108221312590.1374@localhost.localdomain
State New
Headers show

Commit Message

Dimitrios Apostolou Aug. 22, 2011, 10:30 a.m. UTC
Hello list,

this patch has all of my changes to var-tracking that I believe are worth 
keeping. These are all minor changes not touching algorithmic issues, I 
lost much time trying to understand what is actually done in 
var-tracking.c but I still don't get it. I wish there was some document 
describing current implementation. I'd appreciate all help I can get here.

Bootstrapped/tested on x86_64.

Thanks to lxo for helping me, hopefully his plan for better var-tracking 
throughout all optimisation passes will be implemented. This is the only 
var-tracking related doc I could find... ( 
http://gcc.gnu.org/wiki/Var_Tracking_Assignments).

This patch also includes minor stylistic changes that made the code just a 
tiny bit more accessible to me, the indirection was so much that it hardly 
reminded me of C, let me know if I should remove these parts. :-s For the 
sake of completion I'll also post a follow-up patch where I 
delete/simplify a big part of var-tracking, unfortunately with some impact 
on performance.


2011-08-22  Dimitrios Apostolou  <jimis@gmx.net>

 	* var-tracking.c (init_attrs_list_set): Remove function, instead
 	use a memset() call to zero the attr list in...
 	(dataflow_set_init).
 	(vars_copy): Remove function because inserting each element into a
 	new hash table was too costly. Replaced with the ...
 	(htab_dup): ... new function that only does a memcpy() of the
 	element table in htab_t, without rehashing any elements.
 	(shared_hash_unshare): Replace the vars_copy() call with
 	htab_dup(), plus do a little extra work (reference counting) which
 	was in vars_copy.
 	(shared_hash_destroy, dataflow_set_destroy): Add an extra
 	"do_free" bool argument, to avoid iterating over hash tables
 	freeing elements, when not needed.
 	(vt_finalize, vt_emit_notes): Call the above with do_free=false
 	since all pools will be freed later.
 	(dataflow_set_clear, dataflow_set_copy, dataflow_set_union)
 	(dataflow_set_merge, dataflow_set_different, compute_bb_dataflow)
 	(vt_find_locations): Call shared_hash_destroy with do_free=true.
 	(attrs_list_copy): Do not free destination list but reuse already
 	allocated elements if possible.

Comments

Jakub Jelinek Aug. 22, 2011, 12:05 p.m. UTC | #1
On Mon, Aug 22, 2011 at 01:30:33PM +0300, Dimitrios Apostolou wrote:
> --- gcc/var-tracking.c	2011-06-03 01:42:31 +0000
> +++ gcc/var-tracking.c	2011-08-22 09:52:08 +0000
> @@ -1069,14 +1067,14 @@ adjust_insn (basic_block bb, rtx insn)
>  static inline bool
>  dv_is_decl_p (decl_or_value dv)
>  {
> -  return !dv || (int) TREE_CODE ((tree) dv) != (int) VALUE;
> +  return !dv || ((int) TREE_CODE ((tree) dv) != (int) VALUE);
>  }

Why?

>  /* Return true if a decl_or_value is a VALUE rtl.  */
>  static inline bool
>  dv_is_value_p (decl_or_value dv)
>  {
> -  return dv && !dv_is_decl_p (dv);
> +  return !dv_is_decl_p (dv);
>  }

This is fine, though I don't think it makes any difference if var-tracking
is compiled with -O+ - gcc should be able to optimize the second
NULL/non-NULL check out.

> @@ -1191,7 +1189,7 @@ dv_uid2hash (dvuid uid)
>  static inline hashval_t
>  dv_htab_hash (decl_or_value dv)
>  {
> -  return dv_uid2hash (dv_uid (dv));
> +  return (hashval_t) (dv_uid (dv));
>  }

Why?  dv_uid2hash is an inline that does exactly that.

> @@ -1202,7 +1200,7 @@ variable_htab_hash (const void *x)
>  {
>    const_variable const v = (const_variable) x;
>  
> -  return dv_htab_hash (v->dv);
> +  return (hashval_t) (dv_uid (v->dv));
>  }

Why?

>  /* Compare the declaration of variable X with declaration Y.  */
> @@ -1211,9 +1209,8 @@ static int
>  variable_htab_eq (const void *x, const void *y)
>  {
>    const_variable const v = (const_variable) x;
> -  decl_or_value dv = CONST_CAST2 (decl_or_value, const void *, y);
>  
> -  return (dv_as_opaque (v->dv) == dv_as_opaque (dv));
> +  return (v->dv) == y;
>  }

Why?

> @@ -1397,19 +1398,40 @@ shared_var_p (variable var, shared_hash 
>  	  || shared_hash_shared (vars));
>  }
>  
> +/* Copy all variables from hash table SRC to hash table DST without rehashing
> +   any values.  */
> +
> +static htab_t
> +htab_dup (htab_t src)
> +{
> +  htab_t dst;
> +
> +  dst = (htab_t) xmalloc (sizeof (*src));
> +  memcpy (dst, src, sizeof (*src));
> +  dst->entries = (void **) xmalloc (src->size * sizeof (*src->entries));
> +  memcpy (dst->entries, src->entries,
> +	  src->size * sizeof (*src->entries));
> +  return dst;
> +}
> +

This certainly doesn't belong here, it should go into libiberty/hashtab.c
and prototype into include/hashtab.h.  It relies on hashtab.c
implementation details.

> @@ -2034,7 +2041,8 @@ val_resolve (dataflow_set *set, rtx val,
>  static void
>  dataflow_set_init (dataflow_set *set)
>  {
> -  init_attrs_list_set (set->regs);
> +  /* Initialize the set (array) SET of attrs to empty lists.  */
> +  memset (set->regs, 0, sizeof (set->regs));
>    set->vars = shared_hash_copy (empty_shared_hash);
>    set->stack_adjust = 0;
>    set->traversed_vars = NULL;

I'd say you should instead just implement init_attrs_list_set inline using
memset.

>    dst->vars = (shared_hash) pool_alloc (shared_hash_pool);
>    dst->vars->refcount = 1;
>    dst->vars->htab
> -    = htab_create (MAX (src1_elems, src2_elems), variable_htab_hash,
> +    = htab_create (2 * MAX (src1_elems, src2_elems), variable_htab_hash,
>  		   variable_htab_eq, variable_htab_free);

This looks wrong, 2 * max is definitely too much.

> @@ -8996,11 +9006,13 @@ vt_finalize (void)
>  
>    FOR_ALL_BB (bb)
>      {
> -      dataflow_set_destroy (&VTI (bb)->in);
> -      dataflow_set_destroy (&VTI (bb)->out);
> +      /* The "false" do_free parameter means to not bother to iterate and free
> +	 all hash table elements, since we'll destroy the pools. */
> +      dataflow_set_destroy (&VTI (bb)->in, false);
> +      dataflow_set_destroy (&VTI (bb)->out, false);
>        if (VTI (bb)->permp)
>  	{
> -	  dataflow_set_destroy (VTI (bb)->permp);
> +	  dataflow_set_destroy (VTI (bb)->permp, false);
>  	  XDELETE (VTI (bb)->permp);
>  	}
>      }
> 

How much does this actually speed things up (the not freeing pool allocated
stuff during finalizaqtion)?  Is it really worth it?

	Jakub
Dimitrios Apostolou Aug. 23, 2011, 11:40 a.m. UTC | #2
Hi jakub,

On Mon, 22 Aug 2011, Jakub Jelinek wrote:
> On Mon, Aug 22, 2011 at 01:30:33PM +0300, Dimitrios Apostolou wrote:
>
>> @@ -1191,7 +1189,7 @@ dv_uid2hash (dvuid uid)
>>  static inline hashval_t
>>  dv_htab_hash (decl_or_value dv)
>>  {
>> -  return dv_uid2hash (dv_uid (dv));
>> +  return (hashval_t) (dv_uid (dv));
>>  }
>
> Why?  dv_uid2hash is an inline that does exactly that.
>
>> @@ -1202,7 +1200,7 @@ variable_htab_hash (const void *x)
>>  {
>>    const_variable const v = (const_variable) x;
>>
>> -  return dv_htab_hash (v->dv);
>> +  return (hashval_t) (dv_uid (v->dv));
>>  }
>
> Why?
>
>>  /* Compare the declaration of variable X with declaration Y.  */
>> @@ -1211,9 +1209,8 @@ static int
>>  variable_htab_eq (const void *x, const void *y)
>>  {
>>    const_variable const v = (const_variable) x;
>> -  decl_or_value dv = CONST_CAST2 (decl_or_value, const void *, y);
>>
>> -  return (dv_as_opaque (v->dv) == dv_as_opaque (dv));
>> +  return (v->dv) == y;
>>  }
>
> Why?

I was hoping you'd ask so I'll ask back :-) Why are we doing it that way? 
Why so much indirection in the first place? Why create inline functions 
just to typecast and why do we need this CONST_CAST2 ugliness in C code. I 
bet there are things I don't understand so I'd be happy to listen...

The reason I did this (and many more I didn't publish) simplifications 
within var-tracking is because it hurt my brains to follow the 
logic. Even with the help of TAGS I have a specific stack depth before I 
forget where I begun diving into TAG declarations. Well in var-tracking 
this limit was surpassed by much...

>
>> @@ -1397,19 +1398,40 @@ shared_var_p (variable var, shared_hash
>>  	  || shared_hash_shared (vars));
>>  }
>>
>> +/* Copy all variables from hash table SRC to hash table DST without rehashing
>> +   any values.  */
>> +
>> +static htab_t
>> +htab_dup (htab_t src)
>> +{
>> +  htab_t dst;
>> +
>> +  dst = (htab_t) xmalloc (sizeof (*src));
>> +  memcpy (dst, src, sizeof (*src));
>> +  dst->entries = (void **) xmalloc (src->size * sizeof (*src->entries));
>> +  memcpy (dst->entries, src->entries,
>> +	  src->size * sizeof (*src->entries));
>> +  return dst;
>> +}
>> +
>
> This certainly doesn't belong here, it should go into libiberty/hashtab.c
> and prototype into include/hashtab.h.  It relies on hashtab.c
> implementation details.

OK I'll do that in the future. Should I also move some other htab 
functions I saw in var-tracking and rtl? FOR_EACH_HTAB_ELEMENT comes to 
mind, probably other too.

>
>> @@ -2034,7 +2041,8 @@ val_resolve (dataflow_set *set, rtx val,
>>  static void
>>  dataflow_set_init (dataflow_set *set)
>>  {
>> -  init_attrs_list_set (set->regs);
>> +  /* Initialize the set (array) SET of attrs to empty lists.  */
>> +  memset (set->regs, 0, sizeof (set->regs));
>>    set->vars = shared_hash_copy (empty_shared_hash);
>>    set->stack_adjust = 0;
>>    set->traversed_vars = NULL;
>
> I'd say you should instead just implement init_attrs_list_set inline using
> memset.

It's used only once, that's why I deleted the function. I'll bring it back 
if you think it helps.

>
>>    dst->vars = (shared_hash) pool_alloc (shared_hash_pool);
>>    dst->vars->refcount = 1;
>>    dst->vars->htab
>> -    = htab_create (MAX (src1_elems, src2_elems), variable_htab_hash,
>> +    = htab_create (2 * MAX (src1_elems, src2_elems), variable_htab_hash,
>>  		   variable_htab_eq, variable_htab_free);
>
> This looks wrong, 2 * max is definitely too much.

For a hash table to fit N elements, it has to have at least 4/3*N 
slots, or 2*N slots if htab has the 50% load factor I was proposing.

>> @@ -8996,11 +9006,13 @@ vt_finalize (void)
>>
>>    FOR_ALL_BB (bb)
>>      {
>> -      dataflow_set_destroy (&VTI (bb)->in);
>> -      dataflow_set_destroy (&VTI (bb)->out);
>> +      /* The "false" do_free parameter means to not bother to iterate and free
>> +	 all hash table elements, since we'll destroy the pools. */
>> +      dataflow_set_destroy (&VTI (bb)->in, false);
>> +      dataflow_set_destroy (&VTI (bb)->out, false);
>>        if (VTI (bb)->permp)
>>  	{
>> -	  dataflow_set_destroy (VTI (bb)->permp);
>> +	  dataflow_set_destroy (VTI (bb)->permp, false);
>>  	  XDELETE (VTI (bb)->permp);
>>  	}
>>      }
>>
>
> How much does this actually speed things up (the not freeing pool allocated
> stuff during finalizaqtion)?  Is it really worth it?

In total for dataflow_set_destroy I can see that calls to 
attrs_list_clear() have been reduced from 500K to 250K, and I can also see 
a reduction of free() calls from htab_delete(), from 30K to 10K. I'm 
willing to bet that much of this is because of this change, I have kept 
only the ones that showed difference and remember clearly that 
var-tracking is iterating over hash tables too much, either directly or 
from htab_traverse()/htab_delete().


Thanks,
Dimitris
Jakub Jelinek Aug. 23, 2011, 1:43 p.m. UTC | #3
On Tue, Aug 23, 2011 at 02:40:56PM +0300, Dimitrios Apostolou wrote:
> I was hoping you'd ask so I'll ask back :-) Why are we doing it that
> way? Why so much indirection in the first place? Why create inline
> functions just to typecast and why do we need this CONST_CAST2
> ugliness in C code. I bet there are things I don't understand so I'd
> be happy to listen...

It is not indirection, it is abstraction, which should make the code more
readable and allow changing the implementation details.

> OK I'll do that in the future. Should I also move some other htab
> functions I saw in var-tracking and rtl? FOR_EACH_HTAB_ELEMENT comes
> to mind, probably other too.

I guess FOR_EACH_HTAB_ELEMENT could move too (together with all the support
though - tree-flow.h and tree-flow-inline.h related stuff).

> It's used only once, that's why I deleted the function. I'll bring
> it back if you think it helps.

Yes, please.

> >>   dst->vars = (shared_hash) pool_alloc (shared_hash_pool);
> >>   dst->vars->refcount = 1;
> >>   dst->vars->htab
> >>-    = htab_create (MAX (src1_elems, src2_elems), variable_htab_hash,
> >>+    = htab_create (2 * MAX (src1_elems, src2_elems), variable_htab_hash,
> >> 		   variable_htab_eq, variable_htab_free);
> >
> >This looks wrong, 2 * max is definitely too much.
> 
> For a hash table to fit N elements, it has to have at least 4/3*N
> slots, or 2*N slots if htab has the 50% load factor I was proposing.

For var-tracking, 50% load factor is IMHO a very bad idea, memory
consumption of var-tracking is already high right now, we IMHO don't have
the luxury to waste more RAM.

> In total for dataflow_set_destroy I can see that calls to
> attrs_list_clear() have been reduced from 500K to 250K, and I can
> also see a reduction of free() calls from htab_delete(), from 30K to

free calls?  If you avoid free calls, then you end up with a memory leak.
I can understand when you avoid pool_free calls...

	Jakub
Dimitrios Apostolou Aug. 23, 2011, 2:37 p.m. UTC | #4
On Tue, 23 Aug 2011, Jakub Jelinek wrote:
> On Tue, Aug 23, 2011 at 02:40:56PM +0300, Dimitrios Apostolou wrote:
>
>>>>   dst->vars = (shared_hash) pool_alloc (shared_hash_pool);
>>>>   dst->vars->refcount = 1;
>>>>   dst->vars->htab
>>>> -    = htab_create (MAX (src1_elems, src2_elems), variable_htab_hash,
>>>> +    = htab_create (2 * MAX (src1_elems, src2_elems), variable_htab_hash,
>>>> 		   variable_htab_eq, variable_htab_free);
>>>
>>> This looks wrong, 2 * max is definitely too much.
>>
>> For a hash table to fit N elements, it has to have at least 4/3*N
>> slots, or 2*N slots if htab has the 50% load factor I was proposing.
>
> For var-tracking, 50% load factor is IMHO a very bad idea, memory
> consumption of var-tracking is already high right now, we IMHO don't have
> the luxury to waste more RAM.

Agreed, then I 'll change it to 4/3 * MAX so that I avoid expansions.

>> In total for dataflow_set_destroy I can see that calls to
>> attrs_list_clear() have been reduced from 500K to 250K, and I can
>> also see a reduction of free() calls from htab_delete(), from 30K to
>
> free calls?  If you avoid free calls, then you end up with a memory leak.
> I can understand when you avoid pool_free calls...

You are right, I just mentioned the total difference in free() calls from 
all my patches. But in this part there is no free() involved, so the major 
gain should be from avoiding htab_delete() iterating many times over the 
hash tables. Annotated source from the callgrind profiler (shows 
instruction count):

Before:

          .  void
15,820,597  htab_delete (htab_t htab)
    250,914  {
     41,819    size_t size = htab_size (htab);
     41,819    PTR *entries = htab->entries;
          .    int i;
          .
     83,638    if (htab->del_f)
66,493,884      for (i = size - 1; i >= 0; i--)
49,825,998        if (entries[i] != HTAB_EMPTY_ENTRY && entries[i] != HTAB_DELETED_ENTRY)
  1,813,018  	(*htab->del_f) (entries[i]);
  1,800,731  => tree-into-ssa.c:def_blocks_free (10988x)
    345,910  => tree-into-ssa.c:repl_map_free (2815x)
     53,950  => tree-scalar-evolution.c:del_scev_info (1197x)
    139,354  => tree-ssa-loop-im.c:memref_free (198x)
    281,777  => tree-ssa-sccvn.c:free_phi (3788x)
     81,726  => tree-ssa-uncprop.c:equiv_free (463x)
17,359,572  => var-tracking.c:variable_htab_free (835512x)
     42,161  => cfgloop.c:loop_exit_free (720x)
    284,904  => ira-costs.c:cost_classes_del (2270x)
        157  => tree-ssa-loop-im.c:vtoe_free (1x)
11,454,221  => ???:free (37228x)
  1,460,684  => tree-ssa-sccvn.c:free_reference (11329x)
          .
    125,457    if (htab->free_f != NULL)



After:

          .  void
  6,543,474  htab_delete (htab_t htab)
    250,914  {
     41,819    size_t size = htab_size (htab);
     41,819    PTR *entries = htab->entries;
          .    int i;
          .
     83,638    if (htab->del_f)
29,288,268      for (i = size - 1; i >= 0; i--)
21,927,330        if (entries[i] != HTAB_EMPTY_ENTRY && entries[i] != HTAB_DELETED_ENTRY)
  1,738,584  	(*htab->del_f) (entries[i]);
    139,344  => tree-ssa-loop-im.c:memref_free (198x)
        157  => tree-ssa-loop-im.c:vtoe_free (1x)
     81,762  => tree-ssa-uncprop.c:equiv_free (463x)
    281,884  => tree-ssa-sccvn.c:free_phi (3788x)
     42,145  => cfgloop.c:loop_exit_free (720x)
    345,870  => tree-into-ssa.c:repl_map_free (2815x)
  1,462,000  => tree-ssa-sccvn.c:free_reference (11329x)
  1,800,951  => tree-into-ssa.c:def_blocks_free (10988x)
16,518,004  => var-tracking.c:variable_htab_free (824010x)
    284,979  => ira-costs.c:cost_classes_del (2270x)
  9,080,245  => ???:free (11513x)
     53,921  => tree-scalar-evolution.c:del_scev_info (1197x)
          .
    125,457    if (htab->free_f != NULL)



So if the part relevant to this patch is the number of calls to 
variable_htab_free, I suppose it won't make a big difference.

But if I take a look at the top callers and top callees of htab_delete():

Before:

  19,590,149  < tree-ssa-structalias.c:solve_constraints (2058x) [cc1]
  33,299,064  < var-tracking.c:shared_hash_destroy (6923x) [cc1]
  68,941,719  < tree-ssa-pre.c:execute_pre (2058x) [cc1]
134,915,334  *  hashtab.c:htab_delete [cc1]
  17,359,572  >   var-tracking.c:variable_htab_free (835512x) [cc1]
  11,454,221  >   ???:free (108456x) [libc-2.12.so]
   1,800,731  >   tree-into-ssa.c:def_blocks_free (10988x) [cc1]


After:

   8,756,328  < tree-ssa-structalias.c:delete_points_to_sets (1029x) [cc1]
   9,316,165  < tree-ssa-dom.c:tree_ssa_dominator_optimize (562x) [cc1]
  83,696,211  < var-tracking.c:shared_hash_destroy (6923x) [cc1]
  60,459,493  *  hashtab.c:htab_delete [cc1]
  16,518,004  >   var-tracking.c:variable_htab_free (824010x) [cc1]
   9,080,245  >   ???:free (82741x) [libc-2.12.so]
   1,800,951  >   tree-into-ssa.c:def_blocks_free (10988x) [cc1]


Then I'm more confused, htab_delete() seems to be much colder, probably 
due to less traversals of hash tables, but It's not clear how much 
var-tracking changes are responsible for this. I'll keep in mind to revert 
the do_free parameter and report back. I guess for less than 10 M instr 
it's not worth the hassle.


Thanks,
Dimitris
diff mbox

Patch

=== modified file 'gcc/var-tracking.c'
--- gcc/var-tracking.c	2011-06-03 01:42:31 +0000
+++ gcc/var-tracking.c	2011-08-22 09:52:08 +0000
@@ -414,7 +414,6 @@  static hashval_t variable_htab_hash (con
 static int variable_htab_eq (const void *, const void *);
 static void variable_htab_free (void *);
 
-static void init_attrs_list_set (attrs *);
 static void attrs_list_clear (attrs *);
 static attrs attrs_list_member (attrs, decl_or_value, HOST_WIDE_INT);
 static void attrs_list_insert (attrs *, decl_or_value, HOST_WIDE_INT, rtx);
@@ -423,7 +422,6 @@  static void attrs_list_union (attrs *, a
 
 static void **unshare_variable (dataflow_set *set, void **slot, variable var,
 				enum var_init_status);
-static void vars_copy (htab_t, htab_t);
 static tree var_debug_decl (tree);
 static void var_reg_set (dataflow_set *, rtx, enum var_init_status, rtx);
 static void var_reg_delete_and_set (dataflow_set *, rtx, bool,
@@ -447,7 +445,7 @@  static bool variable_part_different_p (v
 static bool onepart_variable_different_p (variable, variable);
 static bool variable_different_p (variable, variable);
 static bool dataflow_set_different (dataflow_set *, dataflow_set *);
-static void dataflow_set_destroy (dataflow_set *);
+static void dataflow_set_destroy (dataflow_set *, bool);
 
 static bool contains_symbol_ref (rtx);
 static bool track_expr_p (tree, bool);
@@ -1069,14 +1067,14 @@  adjust_insn (basic_block bb, rtx insn)
 static inline bool
 dv_is_decl_p (decl_or_value dv)
 {
-  return !dv || (int) TREE_CODE ((tree) dv) != (int) VALUE;
+  return !dv || ((int) TREE_CODE ((tree) dv) != (int) VALUE);
 }
 
 /* Return true if a decl_or_value is a VALUE rtl.  */
 static inline bool
 dv_is_value_p (decl_or_value dv)
 {
-  return dv && !dv_is_decl_p (dv);
+  return !dv_is_decl_p (dv);
 }
 
 /* Return the decl in the decl_or_value.  */
@@ -1092,7 +1090,7 @@  static inline rtx
 dv_as_value (decl_or_value dv)
 {
   gcc_checking_assert (dv_is_value_p (dv));
-  return (rtx)dv;
+  return (rtx) dv;
 }
 
 /* Return the opaque pointer in the decl_or_value.  */
@@ -1191,7 +1189,7 @@  dv_uid2hash (dvuid uid)
 static inline hashval_t
 dv_htab_hash (decl_or_value dv)
 {
-  return dv_uid2hash (dv_uid (dv));
+  return (hashval_t) (dv_uid (dv));
 }
 
 /* The hash function for variable_htab, computes the hash value
@@ -1202,7 +1200,7 @@  variable_htab_hash (const void *x)
 {
   const_variable const v = (const_variable) x;
 
-  return dv_htab_hash (v->dv);
+  return (hashval_t) (dv_uid (v->dv));
 }
 
 /* Compare the declaration of variable X with declaration Y.  */
@@ -1211,9 +1209,8 @@  static int
 variable_htab_eq (const void *x, const void *y)
 {
   const_variable const v = (const_variable) x;
-  decl_or_value dv = CONST_CAST2 (decl_or_value, const void *, y);
 
-  return (dv_as_opaque (v->dv) == dv_as_opaque (dv));
+  return (v->dv) == y;
 }
 
 /* Free the element of VARIABLE_HTAB (its type is struct variable_def).  */
@@ -1265,17 +1262,6 @@  value_chain_htab_eq (const void *x, cons
   return dv_as_opaque (v->dv) == dv_as_opaque (dv);
 }
 
-/* Initialize the set (array) SET of attrs to empty lists.  */
-
-static void
-init_attrs_list_set (attrs *set)
-{
-  int i;
-
-  for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
-    set[i] = NULL;
-}
-
 /* Make the list *LISTP empty.  */
 
 static void
@@ -1323,18 +1309,33 @@  attrs_list_insert (attrs *listp, decl_or
 static void
 attrs_list_copy (attrs *dstp, attrs src)
 {
-  attrs n;
+  attrs n, next, *nextp;
 
-  attrs_list_clear (dstp);
-  for (; src; src = src->next)
+  /* Copy to already existing nodes of *dstp, allocating new only if needed */
+  nextp = dstp;
+  while (src)
     {
-      n = (attrs) pool_alloc (attrs_pool);
+      if (*nextp == NULL)
+	{
+	  (*nextp) = (attrs) pool_alloc (attrs_pool);
+	  (*nextp)->next = NULL;
+	}
+      n = *nextp;
       n->loc = src->loc;
       n->dv = src->dv;
       n->offset = src->offset;
-      n->next = *dstp;
-      *dstp = n;
+      nextp = &n->next;
+      src = src->next;
     }
+  /* Free remaining elements of *dstp, if it was longer than src. */
+  n = *nextp;
+  while (n != NULL)
+    {
+      next = n->next;
+      pool_free (attrs_pool, n);
+      n = next;
+    }
+  (*nextp) = NULL;
 }
 
 /* Add all nodes from SRC which are not in *DSTP to *DSTP.  */
@@ -1397,19 +1398,40 @@  shared_var_p (variable var, shared_hash 
 	  || shared_hash_shared (vars));
 }
 
+/* Copy all variables from hash table SRC to hash table DST without rehashing
+   any values.  */
+
+static htab_t
+htab_dup (htab_t src)
+{
+  htab_t dst;
+
+  dst = (htab_t) xmalloc (sizeof (*src));
+  memcpy (dst, src, sizeof (*src));
+  dst->entries = (void **) xmalloc (src->size * sizeof (*src->entries));
+  memcpy (dst->entries, src->entries,
+	  src->size * sizeof (*src->entries));
+  return dst;
+}
+
 /* Copy variables into a new hash table.  */
 
 static shared_hash
 shared_hash_unshare (shared_hash vars)
 {
+  variable var;
+  htab_iterator hi;
   shared_hash new_vars = (shared_hash) pool_alloc (shared_hash_pool);
+
   gcc_assert (vars->refcount > 1);
   new_vars->refcount = 1;
-  new_vars->htab
-    = htab_create (htab_elements (vars->htab) + 3, variable_htab_hash,
-		   variable_htab_eq, variable_htab_free);
-  vars_copy (new_vars->htab, vars->htab);
+  new_vars->htab = htab_dup (vars->htab);
+  FOR_EACH_HTAB_ELEMENT (new_vars->htab, var, variable, hi)
+    {
+      var->refcount++;
+    }
   vars->refcount--;
+
   return new_vars;
 }
 
@@ -1426,13 +1448,17 @@  shared_hash_copy (shared_hash vars)
    anymore.  */
 
 static void
-shared_hash_destroy (shared_hash vars)
+shared_hash_destroy (shared_hash vars, bool do_free)
 {
   gcc_checking_assert (vars->refcount > 0);
   if (--vars->refcount == 0)
     {
+      if (!do_free)
+	/* Dirty hack to prevent htab_delete() iterating over all elements */
+	vars->htab->del_f = NULL;
       htab_delete (vars->htab);
-      pool_free (shared_hash_pool, vars);
+      if (do_free)
+	pool_free (shared_hash_pool, vars);
     }
 }
 
@@ -1593,25 +1619,6 @@  unshare_variable (dataflow_set *set, voi
   return slot;
 }
 
-/* Copy all variables from hash table SRC to hash table DST.  */
-
-static void
-vars_copy (htab_t dst, htab_t src)
-{
-  htab_iterator hi;
-  variable var;
-
-  FOR_EACH_HTAB_ELEMENT (src, var, variable, hi)
-    {
-      void **dstp;
-      var->refcount++;
-      dstp = htab_find_slot_with_hash (dst, var->dv,
-				       dv_htab_hash (var->dv),
-				       INSERT);
-      *dstp = var;
-    }
-}
-
 /* Map a decl to its main debug decl.  */
 
 static inline tree
@@ -2034,7 +2041,8 @@  val_resolve (dataflow_set *set, rtx val,
 static void
 dataflow_set_init (dataflow_set *set)
 {
-  init_attrs_list_set (set->regs);
+  /* Initialize the set (array) SET of attrs to empty lists.  */
+  memset (set->regs, 0, sizeof (set->regs));
   set->vars = shared_hash_copy (empty_shared_hash);
   set->stack_adjust = 0;
   set->traversed_vars = NULL;
@@ -2050,7 +2058,7 @@  dataflow_set_clear (dataflow_set *set)
   for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
     attrs_list_clear (&set->regs[i]);
 
-  shared_hash_destroy (set->vars);
+  shared_hash_destroy (set->vars, true);
   set->vars = shared_hash_copy (empty_shared_hash);
 }
 
@@ -2064,7 +2072,7 @@  dataflow_set_copy (dataflow_set *dst, da
   for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
     attrs_list_copy (&dst->regs[i], src->regs[i]);
 
-  shared_hash_destroy (dst->vars);
+  shared_hash_destroy (dst->vars, true);
   dst->vars = shared_hash_copy (src->vars);
   dst->stack_adjust = src->stack_adjust;
 }
@@ -2489,7 +2497,7 @@  dataflow_set_union (dataflow_set *dst, d
 
   if (dst->vars == empty_shared_hash)
     {
-      shared_hash_destroy (dst->vars);
+      shared_hash_destroy (dst->vars, true);
       dst->vars = shared_hash_copy (src->vars);
     }
   else
@@ -3711,11 +3719,11 @@  dataflow_set_merge (dataflow_set *dst, d
   src2_elems = htab_elements (shared_hash_htab (src2->vars));
   dataflow_set_init (dst);
   dst->stack_adjust = cur.stack_adjust;
-  shared_hash_destroy (dst->vars);
+  shared_hash_destroy (dst->vars, true);
   dst->vars = (shared_hash) pool_alloc (shared_hash_pool);
   dst->vars->refcount = 1;
   dst->vars->htab
-    = htab_create (MAX (src1_elems, src2_elems), variable_htab_hash,
+    = htab_create (2 * MAX (src1_elems, src2_elems), variable_htab_hash,
 		   variable_htab_eq, variable_htab_free);
 
   for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
@@ -3734,7 +3742,7 @@  dataflow_set_merge (dataflow_set *dst, d
   if (dsm.src_onepart_cnt)
     dst_can_be_shared = false;
 
-  dataflow_set_destroy (src1);
+  dataflow_set_destroy (src1, true);
 }
 
 /* Mark register equivalences.  */
@@ -4503,14 +4511,15 @@  dataflow_set_different (dataflow_set *ol
 /* Free the contents of dataflow set SET.  */
 
 static void
-dataflow_set_destroy (dataflow_set *set)
+dataflow_set_destroy (dataflow_set *set, bool do_free)
 {
   int i;
 
-  for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
-    attrs_list_clear (&set->regs[i]);
+  if (do_free)
+    for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
+      attrs_list_clear (&set->regs[i]);
 
-  shared_hash_destroy (set->vars);
+  shared_hash_destroy (set->vars, do_free);
   set->vars = NULL;
 }
 
@@ -6339,7 +6348,7 @@  compute_bb_dataflow (basic_block bb)
 #endif
     }
   changed = dataflow_set_different (&old_out, out);
-  dataflow_set_destroy (&old_out);
+  dataflow_set_destroy (&old_out, true);
   return changed;
 }
 
@@ -6456,7 +6465,7 @@  vt_find_locations (void)
 #endif
 		      if (dst_can_be_shared)
 			{
-			  shared_hash_destroy (in->vars);
+			  shared_hash_destroy (in->vars, true);
 			  in->vars = shared_hash_copy (first_out->vars);
 			}
 		    }
@@ -8343,7 +8352,8 @@  vt_emit_notes (void)
       gcc_assert (htab_elements (value_chains) == 0);
     }
 #endif
-  dataflow_set_destroy (&cur);
+  /* don't free memory here, we'll free pools right after in vt_finalize */
+  dataflow_set_destroy (&cur, false);
 
   if (MAY_HAVE_DEBUG_INSNS)
     {
@@ -8996,11 +9006,13 @@  vt_finalize (void)
 
   FOR_ALL_BB (bb)
     {
-      dataflow_set_destroy (&VTI (bb)->in);
-      dataflow_set_destroy (&VTI (bb)->out);
+      /* The "false" do_free parameter means to not bother to iterate and free
+	 all hash table elements, since we'll destroy the pools. */
+      dataflow_set_destroy (&VTI (bb)->in, false);
+      dataflow_set_destroy (&VTI (bb)->out, false);
       if (VTI (bb)->permp)
 	{
-	  dataflow_set_destroy (VTI (bb)->permp);
+	  dataflow_set_destroy (VTI (bb)->permp, false);
 	  XDELETE (VTI (bb)->permp);
 	}
     }