Patchwork [google] limit excessive load/store motions (issue4563044)

login
register
mail settings
Submitter Rong Xu
Date June 10, 2011, 5:45 p.m.
Message ID <20110610174509.076F8C4159@rong.mtv.corp.google.com>
Download mbox | patch
Permalink /patch/99940/
State New
Headers show

Comments

Rong Xu - June 10, 2011, 5:45 p.m.
Use a counter to avoid excessive load/store motions in tree and RTL level.
This recovers some of the performance loss in FDO mode for openssl.

2011-06-10  Rong Xu  <xur@google.com>

	* gcc/tree-ssa-loop-im.c (maximum_lsm): New define.
	(find_refs_for_sm): Limit excessive lsm.
	* gcc/gcse.c (cfgloop.h): New include.
	(maximum_lsm_limit): New define.
	(struct loop_lsm_limit_map): Ditto.
	(loop_lsm_limit_map_htab): Ditto.
	(loops_lsm): Ditto.
	(dominator_info_avail_before_lsm_limit): Ditto.
	(compute_ld_motion_mems): Limit excessive lsm.
	(find_loop_lsm_limit): New function.
	(adjust_loop_lsm_limit): Ditto.
	(init_loop_lsm_limit): Ditto.
	(fini_loop_lsm_limit): Ditto.
	(estimate_reg_pressure_before_lsm): Ditto.
	(loop_lsm_limit_map_hash): Ditto.
	(loop_lsm_limit_map_eq): Ditto.
	(free_loop_lsm_limit_map): Ditto.


--
This patch is available for review at http://codereview.appspot.com/4563044
Diego Novillo - June 10, 2011, 5:52 p.m.
On Fri, Jun 10, 2011 at 10:45, Rong Xu <xur@google.com> wrote:
> Use a counter to avoid excessive load/store motions in tree and RTL level.
> This recovers some of the performance loss in FDO mode for openssl.

Rong, is this for trunk or google/main?  If it's for trunk, please
make sure you mark it as such.

OK for google/main for now.


Diego.
Rong Xu - June 10, 2011, 5:56 p.m.
This is for google/main for now. This is a port from my work in 4.4.3.
I may need to rework the register pressure estimation (try to see if I
can reuse the function in trunk) before submitting to trunk.

-Rong

On Fri, Jun 10, 2011 at 10:52 AM, Diego Novillo <dnovillo@google.com> wrote:
> On Fri, Jun 10, 2011 at 10:45, Rong Xu <xur@google.com> wrote:
>> Use a counter to avoid excessive load/store motions in tree and RTL level.
>> This recovers some of the performance loss in FDO mode for openssl.
>
> Rong, is this for trunk or google/main?  If it's for trunk, please
> make sure you mark it as such.
>
> OK for google/main for now.
>
>
> Diego.
>

Patch

Index: gcc/tree-ssa-loop-im.c
===================================================================
--- gcc/tree-ssa-loop-im.c	(revision 174918)
+++ gcc/tree-ssa-loop-im.c	(working copy)
@@ -97,6 +97,9 @@ 
 				   MAX_LOOP loop.  */
 };
 
+/* Limit on the number of lsm operations that can be performed for one loop.  */
+static unsigned maximum_lsm;
+
 /* Maps statements to their lim_aux_data.  */
 
 static struct pointer_map_t *lim_aux_data_map;
@@ -2359,12 +2362,16 @@ 
   unsigned i;
   bitmap_iterator bi;
   mem_ref_p ref;
+  unsigned sm_count = 0;
 
   EXECUTE_IF_AND_COMPL_IN_BITMAP (refs, sm_executed, 0, i, bi)
     {
       ref = VEC_index (mem_ref_p, memory_accesses.refs_list, i);
-      if (can_sm_ref_p (loop, ref))
-	bitmap_set_bit (refs_to_sm, i);
+      if (sm_count < maximum_lsm && can_sm_ref_p (loop, ref))
+        {
+          bitmap_set_bit (refs_to_sm, i);
+          ++sm_count;
+        }
     }
 }
 
@@ -2524,6 +2531,10 @@ 
   sbitmap_free (contains_call);
 
   lim_aux_data_map = pointer_map_create ();
+
+  /* Suppress excessive store-motion.  Decrease target_avail_regs by 1
+     for the induction variable.  Maybe we should use even less?  */
+  maximum_lsm = (target_avail_regs < 1 ? 0 : target_avail_regs - 1);
 }
 
 /* Cleans up after the invariant motion pass.  */
Index: gcc/gcse.c
===================================================================
--- gcc/gcse.c	(revision 174918)
+++ gcc/gcse.c	(working copy)
@@ -171,6 +171,7 @@ 
 #include "dbgcnt.h"
 #include "target.h"
 #include "gcse.h"
+#include "cfgloop.h"
 
 /* We support GCSE via Partial Redundancy Elimination.  PRE optimizations
    are a superset of those done by classic GCSE.
@@ -543,6 +544,10 @@ 
 static bool do_local_cprop (rtx, rtx);
 static int local_cprop_pass (void);
 static bool is_too_expensive (const char *);
+static unsigned find_loop_lsm_limit (struct loop*);
+static void adjust_loop_lsm_limit (struct loop*, unsigned);
+static void init_loop_lsm_limit (void);
+static void fini_loop_lsm_limit (void);
 
 #define GNEW(T)			((T *) gmalloc (sizeof (T)))
 #define GCNEW(T)		((T *) gcalloc (1, sizeof (T)))
@@ -4979,6 +4984,10 @@ 
     }
 }
 
+/* The maximum number of load/store motions that can be performed for one
+   loop.  */
+static unsigned maximum_lsm_limit;
+
 /* Find all the 'simple' MEMs which are used in LOADs and STORES.  Simple
    being defined as MEM loads and stores to symbols, with no side effects
    and no registers in the expression.  For a MEM destination, we also
@@ -4994,12 +5003,18 @@ 
   basic_block bb;
   rtx insn;
 
+  init_loop_lsm_limit ();
+
   pre_ldst_mems = NULL;
   pre_ldst_table = htab_create (13, pre_ldst_expr_hash,
 				pre_ldst_expr_eq, NULL);
 
   FOR_EACH_BB (bb)
     {
+      struct loop *loop = bb->loop_father;
+      unsigned ld_motion_limit = find_loop_lsm_limit (loop);
+      unsigned ld_motion_count = 0;
+
       FOR_BB_INSNS (bb, insn)
 	{
 	  if (NONDEBUG_INSN_P (insn))
@@ -5032,12 +5047,25 @@ 
 		    {
 		      ptr = ldst_entry (dest);
 
-		      if (! MEM_P (src)
-			  && GET_CODE (src) != ASM_OPERANDS
-			  /* Check for REG manually since want_to_gcse_p
-			     returns 0 for all REGs.  */
-			  && can_assign_to_reg_without_clobbers_p (src))
-			ptr->stores = alloc_INSN_LIST (insn, ptr->stores);
+                      if (ld_motion_count >= ld_motion_limit)
+                        {
+                          if (dump_file)
+                            fprintf (dump_file,
+                                     "suppress ld_motion in loop=%d bb=%d due"
+                                     " to reg pressure.\n",
+                                     loop->num, bb->index);
+
+                          ptr->invalid = 1;
+                        }
+                      else if (! MEM_P (src)
+                          && GET_CODE (src) != ASM_OPERANDS
+                          /* Check for REG manually since want_to_gcse_p
+                             returns 0 for all REGs.  */
+                          && can_assign_to_reg_without_clobbers_p (src))
+                        {
+                          ptr->stores = alloc_INSN_LIST (insn, ptr->stores);
+                          ld_motion_count++;
+                        }
 		      else
 			ptr->invalid = 1;
 		    }
@@ -5046,9 +5074,201 @@ 
 		invalidate_any_buried_refs (PATTERN (insn));
 	    }
 	}
+      adjust_loop_lsm_limit (loop, ld_motion_count);
     }
+
+  fini_loop_lsm_limit ();
 }
 
+typedef struct 
+{
+  struct loop *loop;
+  unsigned lsm_limit;
+}loop_lsm_limit_map;
+
+/* hash_table that maps loop to lsm_limit  */
+static htab_t loop_lsm_limit_map_htab = NULL;
+
+/* hash function for loop_lsm_limit_map_htab */
+
+static hashval_t
+loop_lsm_limit_map_hash (const void *p)
+{
+  const loop_lsm_limit_map *const x = (const loop_lsm_limit_map*) p;
+  return htab_hash_pointer (x->loop);
+}
+
+/* hash equal function for loop_lsm_limit_map_htab */
+
+static int
+loop_lsm_limit_map_eq (const void *p1, const void *p2)
+{
+  const loop_lsm_limit_map *const ptr1 = (const loop_lsm_limit_map *) p1,
+    *const ptr2 = (const loop_lsm_limit_map *) p2;
+
+  return htab_eq_pointer (ptr1->loop, ptr2->loop);
+}
+
+/* free one entry in loop_lsm_limit_map_htab */
+
+static int
+free_loop_lsm_limit_map (void **slot, void *data ATTRIBUTE_UNUSED)
+{
+  free(*(loop_lsm_limit_map **) slot);
+  return 1;
+}
+
+/* Use the loops structure to find the enclosing loop for each basic_block.  */
+static struct loops loops_lsm;
+static bool dominator_info_avail_before_lsm_limit = false;
+
+/* Initialization function for suppressing excessive load/store motions.
+   Build the loop structure so that we can count the number of motions
+   performed.  Set the maximum number of motions for a loop.  */
+
+static void
+init_loop_lsm_limit (void)
+{
+  dominator_info_avail_before_lsm_limit = dom_info_available_p(CDI_DOMINATORS);
+  flow_loops_find (&loops_lsm);
+
+  loop_lsm_limit_map_htab = htab_create (59, loop_lsm_limit_map_hash,
+                                         loop_lsm_limit_map_eq, NULL);
+
+  /* avail_regs minus the induction variables  */
+  maximum_lsm_limit = target_avail_regs - 1;
+}
+
+/* Cleanup function: destroy loop structure and free space.  */
+
+static void
+fini_loop_lsm_limit (void)
+{
+  basic_block bb;
+
+  flow_loops_free (&loops_lsm);
+
+  if (!dominator_info_avail_before_lsm_limit)
+    free_dominance_info (CDI_DOMINATORS);
+  
+  FOR_ALL_BB (bb)
+    bb->loop_father = NULL;
+
+  if (loop_lsm_limit_map_htab)
+    {
+      htab_traverse (loop_lsm_limit_map_htab, free_loop_lsm_limit_map, NULL);
+      htab_delete (loop_lsm_limit_map_htab);
+    }
+}
+
+/* This routine tries to find the number of registers that
+   (1) live across the iteration, and
+   (2) are referenced in the loop.
+   Those store-motions performed in tree-lim fall into this category.  */
+
+static unsigned
+estimate_reg_pressure_before_lsm (struct loop *loop)
+{
+  basic_block *body;
+  unsigned int i;
+  unsigned int count;
+  regset live_across_iteration;
+  regset used_in_loop;
+  basic_block header, latch;
+
+  if (!loop)
+    return 0;
+
+  gcc_assert (loop != loops_lsm.tree_root);
+
+  latch = loop->latch;
+  /* don't handle multiple latches */
+  if (!latch)
+    return 0;
+
+  header = loop->header;
+  gcc_assert (header);
+
+  live_across_iteration = ALLOC_REG_SET (&reg_obstack);
+  COPY_REG_SET (live_across_iteration, DF_LR_IN (header));
+  AND_REG_SET (live_across_iteration, DF_LR_OUT (latch));
+
+  used_in_loop = ALLOC_REG_SET (&reg_obstack);
+  CLEAR_REG_SET (used_in_loop);
+
+  body = get_loop_body (loop);
+  /* this loop computes a subset of must-use */
+  for (i=0; i<loop->num_nodes; i++)
+    {
+       basic_block bb = *(body + i);
+
+       if (dominated_by_p (CDI_DOMINATORS, latch, bb))
+         IOR_REG_SET (used_in_loop, &(DF_LR_BB_INFO (bb)->use));
+    }
+
+
+  AND_REG_SET (live_across_iteration, used_in_loop);
+
+  count = bitmap_count_bits (live_across_iteration);
+
+  FREE_REG_SET (live_across_iteration);
+  FREE_REG_SET (used_in_loop);
+
+  return count;
+}
+
+/* Find the number of load/store motions we can perform without
+   creating excessive register pressure in loop LOOP.  */
+
+static unsigned
+find_loop_lsm_limit (struct loop *loop)
+{
+  void **slot;
+  loop_lsm_limit_map t, *ret;
+  unsigned reg_pressure;
+  unsigned limit = maximum_lsm_limit;
+
+  if (!loop || loop == loops_lsm.tree_root)
+    return (unsigned)-1;
+
+  t.loop = loop;
+  slot = htab_find_slot (loop_lsm_limit_map_htab, &t, NO_INSERT);
+  if (slot)
+    return (*(loop_lsm_limit_map**)slot)->lsm_limit;
+
+  /* insert this loop and initialize it  */
+  reg_pressure = estimate_reg_pressure_before_lsm (loop);
+  ret = (loop_lsm_limit_map*) gmalloc (sizeof (loop_lsm_limit_map));
+  gcc_assert (ret);
+
+  limit = (limit > reg_pressure ? limit - reg_pressure : 0);
+  ret->loop = loop;
+  ret->lsm_limit = limit;
+  slot = htab_find_slot (loop_lsm_limit_map_htab, ret, INSERT);
+  gcc_assert (slot);
+  (*slot) = (void**)ret;
+
+  return limit;
+}
+
+static void
+adjust_loop_lsm_limit (struct loop *loop, unsigned performed)
+{
+  void **slot;
+  loop_lsm_limit_map t;
+  unsigned limit;
+
+  if (performed == 0 || loop == NULL || loop == loops_lsm.tree_root)
+    return;
+
+  t.loop = loop;
+  slot = htab_find_slot (loop_lsm_limit_map_htab, &t, NO_INSERT);
+  gcc_assert (slot);
+  limit = (*(loop_lsm_limit_map**)slot)->lsm_limit;
+  gcc_assert (limit >= performed);
+  (*(loop_lsm_limit_map**)slot)->lsm_limit = limit - performed;
+}
+
 /* Remove any references that have been either invalidated or are not in the
    expression list for pre gcse.  */