diff mbox

0006-GCSE-complex-constants.patch

Message ID 4C226A4F.8040808@codesourcery.com
State New
Headers show

Commit Message

Maxim Kuvyrkov June 23, 2010, 8:10 p.m. UTC
On 6/16/10 8:54 PM, Jeff Law wrote:
>> Certain architectures (e.g., ARM) cannot easily operate with
>> constants, they need to emit sequences of several instructions to load
>> constants into registers. The common procedure to do this is to emit a
>> (parallel [(set) (clobber (reg1)) ... (clobber (regN))]) instruction
>> which later splits into several instructions using pseudos (regX) to
>> store intermediate values.
>>
>> Currently PRE and hoist do not GCSE constants, and there is a good
>> reason for that, to avoid increasing register pressure; interestingly,
>> symbol_refs are allowed to be GCSE'ed, is this intentional or by
>> accident?
> It's intentional; a SYMBOL_REF if often be rather expensive. Some
> CONST_INTs can have that same property. One could argue that an
> expensive CONST_INT shouldn't be appearing in RTL, but certainly some
> ports have chosen to handle splitting insns with expensive constants
> later in the pipeline.
>
>>
>> In any case, it seems like a good idea to GCSE constants and
>> symbol_refs that need something beyond a simple (set) to get into a
>> register, and not GCSE them otherwise.
> Rather than triggering this on the PARALLEL it might be better to
> trigger it on the cost of the RTX. Triggering on the PARALLEL looks like
> a hack to me -- IMHO we'd be better off fixing the costing mechanism and
> using costing as the trigger.

Here is reworked patch that

(a) introduces max_distance property to expressions (counted in 
instructions),

(b) uses RTX cost model to estimate how far an expression can travel 
(the greater the cost, the farther the distance), and

(c) adds two new parameters to tweak the above.

I am yet to do benchmarking on ARM and x86[_64] to find out what the 
optimal parameter values are.  Before starting with testing I would like 
to get feedback on the concept and its implementation.

Thank you,
diff mbox

Patch

diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 05ebcf0..ebdd5c1 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -8195,6 +8195,19 @@  when @option{-ftree-vectorize} is used.  The number of iterations after
 vectorization needs to be greater than the value specified by this option
 to allow vectorization.  The default value is 0.
 
+@item gcse-cost-distance-ratio
+Scaling factor in calculation of maximum distance an expression
+can be moved by GCSE optimizations.  This is currently supported only in
+code hoisting pass.  The bigger the ratio, the more agressive code hoisting
+will be with expressions which have cost less than
+@option{gcse-unrestricted-cost}.  The default value is 2.
+
+@item gcse-unrestricted-cost
+Cost at which GCSE optimizations will not constraint the distance
+an expression can travel.  This is currently supported only in
+code hoisting pass.  The lesser the cost, the more aggressive code hoisting
+will be.  The default value is 3.
+
 @item max-hoist-depth
 The depth of search in the dominator tree for expressions to hoist.
 This is used to avoid quadratic behavior in hoisting algorithm.
diff --git a/gcc/gcse.c b/gcc/gcse.c
index e323db1..45eb7bc 100644
--- a/gcc/gcse.c
+++ b/gcc/gcse.c
@@ -295,6 +295,12 @@  struct expr
      The value is the newly created pseudo-reg to record a copy of the
      expression in all the places that reach the redundant copy.  */
   rtx reaching_reg;
+  /* Maximum distance in instructions this expression can travel.
+     We avoid moving simple expressions for more than a few instructions
+     to keep register pressure under control.
+     A value of "0" removes restrictions on how far the expression can
+     travel.  */
+  int max_distance;
 };
 
 /* Occurrence of an expression.
@@ -431,12 +437,12 @@  static void hash_scan_insn (rtx, struct hash_table_d *);
 static void hash_scan_set (rtx, rtx, struct hash_table_d *);
 static void hash_scan_clobber (rtx, rtx, struct hash_table_d *);
 static void hash_scan_call (rtx, rtx, struct hash_table_d *);
-static int want_to_gcse_p (rtx);
+static int want_to_gcse_p (rtx, int *);
 static bool gcse_constant_p (const_rtx);
 static int oprs_unchanged_p (const_rtx, const_rtx, int);
 static int oprs_anticipatable_p (const_rtx, const_rtx);
 static int oprs_available_p (const_rtx, const_rtx);
-static void insert_expr_in_table (rtx, enum machine_mode, rtx, int, int,
+static void insert_expr_in_table (rtx, enum machine_mode, rtx, int, int, int,
 				  struct hash_table_d *);
 static void insert_set_in_table (rtx, rtx, struct hash_table_d *);
 static unsigned int hash_expr (const_rtx, enum machine_mode, int *, int);
@@ -496,7 +502,8 @@  static void alloc_code_hoist_mem (int, int);
 static void free_code_hoist_mem (void);
 static void compute_code_hoist_vbeinout (void);
 static void compute_code_hoist_data (void);
-static int hoist_expr_reaches_here_p (basic_block, int, basic_block, char *);
+static int hoist_expr_reaches_here_p (basic_block, int, basic_block, char *,
+				      int, int *);
 static int hoist_code (void);
 static int one_code_hoisting_pass (void);
 static rtx process_insert_insn (struct expr *);
@@ -754,8 +761,11 @@  static basic_block current_bb;
    GCSE.  */
 
 static int
-want_to_gcse_p (rtx x)
+want_to_gcse_p (rtx x, int *max_distance_ptr)
 {
+  int cost;
+  int max_distance;
+
 #ifdef STACK_REGS
   /* On register stack architectures, don't GCSE constants from the
      constant pool, as the benefits are often swamped by the overhead
@@ -764,14 +774,42 @@  want_to_gcse_p (rtx x)
     x = avoid_constant_pool_reference (x);
 #endif
 
+  /* GCSE'ing constants:
+
+     We do not specifically distinguish between constant and non-constant
+     expressions in PRE and Hoist.  We use rtx_cost below to limit
+     the maximum distance simple expressions can travel.
+
+     Nevertheless, constants are much easier to GCSE, and, hence,
+     it is easy to overdo the optimizations.  Usually, excessive PRE and
+     Hoisting of constant leads to increased register pressure.
+
+     RA can deal with this by rematerialing some of the constants.
+     Therefore, it is important that the back-end generates sets of constants
+     in a way that allows reload rematerialize them under high register
+     pressure, i.e., a pseudo register with REG_EQUAL to constant
+     is set only once.  Failing to do so will result in IRA/reload
+     spilling such constants under high register pressure instead of
+     rematerializing them.  */
+
+  cost = rtx_cost (x, SET, optimize_function_for_speed_p (cfun));
+
+  if (cost < COSTS_N_INSNS (GCSE_UNRESTRICTED_COST))
+    {
+      max_distance = GCSE_COST_DISTANCE_RATIO * cost;
+      if (max_distance == 0)
+	return 0;
+    }
+  else
+    max_distance = 0;
+
+  if (max_distance_ptr)
+    *max_distance_ptr = max_distance;
+
   switch (GET_CODE (x))
     {
     case REG:
     case SUBREG:
-    case CONST_INT:
-    case CONST_DOUBLE:
-    case CONST_FIXED:
-    case CONST_VECTOR:
     case CALL:
       return 0;
 
@@ -1089,11 +1127,15 @@  expr_equiv_p (const_rtx x, const_rtx y)
    It is only used if X is a CONST_INT.
 
    ANTIC_P is nonzero if X is an anticipatable expression.
-   AVAIL_P is nonzero if X is an available expression.  */
+   AVAIL_P is nonzero if X is an available expression.
+
+   MAX_DISTANCE is the maximum distance in instructions this expression can
+   be moved.
+*/
 
 static void
 insert_expr_in_table (rtx x, enum machine_mode mode, rtx insn, int antic_p,
-		      int avail_p, struct hash_table_d *table)
+		      int avail_p, int max_distance, struct hash_table_d *table)
 {
   int found, do_not_record_p;
   unsigned int hash;
@@ -1136,7 +1178,10 @@  insert_expr_in_table (rtx x, enum machine_mode mode, rtx insn, int antic_p,
       cur_expr->next_same_hash = NULL;
       cur_expr->antic_occr = NULL;
       cur_expr->avail_occr = NULL;
+      cur_expr->max_distance = max_distance;
     }
+  else
+    gcc_assert (cur_expr->max_distance == max_distance);
 
   /* Now record the occurrence(s).  */
   if (antic_p)
@@ -1237,6 +1282,7 @@  insert_set_in_table (rtx x, rtx insn, struct hash_table_d *table)
       cur_expr->next_same_hash = NULL;
       cur_expr->antic_occr = NULL;
       cur_expr->avail_occr = NULL;
+      cur_expr->max_distance = 0; /* Not used for set_p tables.  */
     }
 
   /* Now record the occurrence.  */
@@ -1306,6 +1352,7 @@  hash_scan_set (rtx pat, rtx insn, struct hash_table_d *table)
     {
       unsigned int regno = REGNO (dest);
       rtx tmp;
+      int max_distance = 0;
 
       /* See if a REG_EQUAL note shows this equivalent to a simpler expression.
 
@@ -1328,7 +1375,7 @@  hash_scan_set (rtx pat, rtx insn, struct hash_table_d *table)
 	  && !REG_P (src)
 	  && (table->set_p
 	      ? gcse_constant_p (XEXP (note, 0))
-	      : want_to_gcse_p (XEXP (note, 0))))
+	      : want_to_gcse_p (XEXP (note, 0), NULL)))
 	src = XEXP (note, 0), pat = gen_rtx_SET (VOIDmode, dest, src);
 
       /* Only record sets of pseudo-regs in the hash table.  */
@@ -1343,7 +1390,7 @@  hash_scan_set (rtx pat, rtx insn, struct hash_table_d *table)
 	     can't do the same thing at the rtl level.  */
 	  && !can_throw_internal (insn)
 	  /* Is SET_SRC something we want to gcse?  */
-	  && want_to_gcse_p (src)
+	  && want_to_gcse_p (src, &max_distance)
 	  /* Don't CSE a nop.  */
 	  && ! set_noop_p (pat)
 	  /* Don't GCSE if it has attached REG_EQUIV note.
@@ -1367,7 +1414,8 @@  hash_scan_set (rtx pat, rtx insn, struct hash_table_d *table)
 	  int avail_p = (oprs_available_p (src, insn)
 			 && ! JUMP_P (insn));
 
-	  insert_expr_in_table (src, GET_MODE (dest), insn, antic_p, avail_p, table);
+	  insert_expr_in_table (src, GET_MODE (dest), insn, antic_p, avail_p,
+				max_distance, table);
 	}
 
       /* Record sets for constant/copy propagation.  */
@@ -1404,7 +1452,7 @@  hash_scan_set (rtx pat, rtx insn, struct hash_table_d *table)
 	      do that easily for EH edges so disable GCSE on these for now.  */
 	   && !can_throw_internal (insn)
 	   /* Is SET_DEST something we want to gcse?  */
-	   && want_to_gcse_p (dest)
+	   && want_to_gcse_p (dest, NULL)
 	   /* Don't CSE a nop.  */
 	   && ! set_noop_p (pat)
 	   /* Don't GCSE if it has attached REG_EQUIV note.
@@ -1425,7 +1473,7 @@  hash_scan_set (rtx pat, rtx insn, struct hash_table_d *table)
 			     && ! JUMP_P (insn);
 
 	       /* Record the memory expression (DEST) in the hash table.  */
-	       insert_expr_in_table (dest, GET_MODE (dest), insn,
+	       insert_expr_in_table (dest, GET_MODE (dest), insn, 0,
 				     antic_p, avail_p, table);
              }
       }
@@ -1512,8 +1560,8 @@  dump_hash_table (FILE *file, const char *name, struct hash_table_d *table)
     if (flat_table[i] != 0)
       {
 	expr = flat_table[i];
-	fprintf (file, "Index %d (hash value %d)\n  ",
-		 expr->bitmap_index, hash_val[i]);
+	fprintf (file, "Index %d (hash value %d; max distance %d)\n  ",
+		 expr->bitmap_index, hash_val[i], expr->max_distance);
 	print_rtl (file, expr->expr);
 	fprintf (file, "\n");
       }
@@ -4284,6 +4332,8 @@  compute_code_hoist_data (void)
 
 /* Determine if the expression identified by EXPR_INDEX would
    reach BB unimpared if it was placed at the end of EXPR_BB.
+   Stop the search if the expression would need to be moved more
+   than DISTANCE instructions.
 
    It's unclear exactly what Muchnick meant by "unimpared".  It seems
    to me that the expression must either be computed or transparent in
@@ -4296,7 +4346,8 @@  compute_code_hoist_data (void)
    paths.  */
 
 static int
-hoist_expr_reaches_here_p (basic_block expr_bb, int expr_index, basic_block bb, char *visited)
+hoist_expr_reaches_here_p (basic_block expr_bb, int expr_index, basic_block bb,
+			   char *visited, int distance, int *bb_size)
 {
   edge pred;
   edge_iterator ei;
@@ -4309,6 +4360,18 @@  hoist_expr_reaches_here_p (basic_block expr_bb, int expr_index, basic_block bb,
       visited = XCNEWVEC (char, last_basic_block);
     }
 
+  /* Terminate the search if distance, for which EXPR is allowed to move,
+     is exhausted.  */
+  if (distance > 0)
+    {
+      distance -= bb_size[bb->index];
+
+      if (distance <= 0)
+	return 0;
+    }
+  else
+    gcc_assert (distance == 0);
+
   FOR_EACH_EDGE (pred, ei, bb->preds)
     {
       basic_block pred_bb = pred->src;
@@ -4330,8 +4393,8 @@  hoist_expr_reaches_here_p (basic_block expr_bb, int expr_index, basic_block bb,
       else
 	{
 	  visited[pred_bb->index] = 1;
-	  if (! hoist_expr_reaches_here_p (expr_bb, expr_index,
-					   pred_bb, visited))
+	  if (! hoist_expr_reaches_here_p (expr_bb, expr_index, pred_bb,
+					   visited, distance, bb_size))
 	    break;
 	}
     }
@@ -4341,6 +4404,19 @@  hoist_expr_reaches_here_p (basic_block expr_bb, int expr_index, basic_block bb,
   return (pred == NULL);
 }
 
+/* Find anticipatable occurence of EXPR in BB.  */
+static struct occr *
+find_antic_occr_in_bb (struct expr *expr, basic_block bb)
+{
+  struct occr *occr = expr->antic_occr;
+
+  /* Find the right occurrence of this expression.  */
+  while (BLOCK_FOR_INSN (occr->insn) != bb && occr)
+    occr = occr->next;
+
+  return occr;
+}
+
 /* Actually perform code hoisting.  */
 
 static int
@@ -4351,6 +4427,7 @@  hoist_code (void)
   unsigned int i,j;
   struct expr **index_map;
   struct expr *expr;
+  int *bb_size;
   int changed = 0;
 
   sbitmap_vector_zero (hoist_exprs, last_basic_block);
@@ -4363,6 +4440,28 @@  hoist_code (void)
     for (expr = expr_hash_table.table[i]; expr != NULL; expr = expr->next_same_hash)
       index_map[expr->bitmap_index] = expr;
 
+  bb_size = XCNEWVEC (int, last_basic_block);
+  FOR_EACH_BB (bb)
+    {
+      rtx bb_head;
+      rtx bb_end;
+
+      bb_head = next_nonnote_insn (BB_HEAD (bb));
+      bb_end = BB_END (bb);
+      bb_end = INSN_P (bb_end) ? bb_end : prev_nonnote_insn (bb_end);
+
+      if (bb_head && BLOCK_FOR_INSN (bb_head) == bb
+	  && bb_end && BLOCK_FOR_INSN (bb_end) == bb)
+	{
+	  gcc_assert (INSN_P (bb_head) && INSN_P (bb_end));
+	  bb_size[bb->index] = (DF_INSN_LUID (bb_end) - DF_INSN_LUID (bb_head)
+				+ 1);
+	  gcc_assert (bb_size[bb->index] >= 1);
+	}
+      else
+	bb_size[bb->index] = 0;
+    }
+
   /* Walk over each basic block looking for potentially hoistable
      expressions, nothing gets hoisted from the entry block.  */
   FOR_EACH_BB (bb)
@@ -4391,6 +4490,8 @@  hoist_code (void)
 		 computes the expression.  */
 	      for (j = 0; VEC_iterate (basic_block, domby, j, dominated); j++)
 		{
+		  int max_distance;
+
 		  /* Ignore self dominance.  */
 		  if (bb == dominated)
 		    continue;
@@ -4400,12 +4501,42 @@  hoist_code (void)
 		  if (!TEST_BIT (antloc[dominated->index], i))
 		    continue;
 
+		  expr = index_map[i];
+
+		  max_distance = expr->max_distance;
+		  if (max_distance > 0)
+		    {
+		      struct occr *occr;
+
+		      occr = find_antic_occr_in_bb (expr, dominated);
+		      gcc_assert (occr);
+
+		      /* An occurence might've been already deleted
+			 while processing a dominator of BB.  */
+		      if (occr->deleted_p)
+			{
+			  gcc_assert (MAX_HOIST_DEPTH > 1);
+			  continue;
+			}
+
+		      /* Adjust MAX_DISTANCE to account for the fact that
+			 OCCR won't have to travel all of DOMINATED, but
+			 only part of it.
+			 Note: DF_INSN_LUIDs should be used cautiously once
+			 we start emitting new instructions.  Luckily, we
+			 are sure that occr->insn was present at the time of
+			 df_analyze, so it has valid DF_INSN_LUID.  */
+		      max_distance += (bb_size[dominated->index]
+				       - DF_INSN_LUID (occr->insn));
+		    }
+
 		  /* Note if the expression would reach the dominated block
 		     unimpared if it was placed at the end of BB.
 
 		     Keep track of how many times this expression is hoistable
 		     from a dominated block into BB.  */
-		  if (hoist_expr_reaches_here_p (bb, i, dominated, NULL))
+		  if (hoist_expr_reaches_here_p (bb, i, dominated, NULL,
+						 max_distance, bb_size))
 		    hoistable++;
 		}
 
@@ -4448,6 +4579,9 @@  hoist_code (void)
 		 computes the expression.  */
 	      for (j = 0; VEC_iterate (basic_block, domby, j, dominated); j++)
 		{
+		  struct occr *occr = NULL;
+		  int max_distance;
+
 		  /* Ignore self dominance.  */
 		  if (bb == dominated)
 		    continue;
@@ -4458,6 +4592,33 @@  hoist_code (void)
 		  if (!TEST_BIT (antloc[dominated->index], i))
 		    continue;
 
+		  expr = index_map[i];
+
+		  max_distance = expr->max_distance;
+		  if (max_distance > 0)
+		    {
+		      occr = find_antic_occr_in_bb (index_map[i], dominated);
+		      gcc_assert (occr);
+
+		      /* An occurence might've been already deleted
+			 while processing a dominator of BB.  */
+		      if (occr->deleted_p)
+			{
+			  gcc_assert (MAX_HOIST_DEPTH > 1);
+			  continue;
+			}
+
+		      /* Adjust MAX_DISTANCE to account for the fact that
+			 OCCR won't have to travel all of DOMINATED, but
+			 only part of it.
+			 Note: DF_INSN_LUIDs should be used cautiously once
+			 we start emitting new instructions.  Luckily, we
+			 are sure that occr->insn was present at the time of
+			 df_analyze, so it has valid DF_INSN_LUID.  */
+		      max_distance += (bb_size[dominated->index]
+				       - DF_INSN_LUID (occr->insn));
+		    }
+
 		  /* The expression is computed in the dominated block and
 		     it would be safe to compute it at the start of the
 		     dominated block.  Now we have to determine if the
@@ -4467,23 +4628,25 @@  hoist_code (void)
 		     that /some/, not necesserilly all, occurences from
 		     the dominated blocks can be hoisted to BB.  Here we check
 		     if a specific occurence can be hoisted to BB.  */
-		  if (hoist_expr_reaches_here_p (bb, i, dominated, NULL))
+		  if (hoist_expr_reaches_here_p (bb, i, dominated, NULL,
+						 max_distance, bb_size))
 		    {
-		      struct expr *expr = index_map[i];
-		      struct occr *occr = expr->antic_occr;
 		      rtx insn;
 		      rtx set;
 
-		      /* Find the right occurrence of this expression.  */
-		      while (BLOCK_FOR_INSN (occr->insn) != dominated && occr)
-			occr = occr->next;
-
-		      gcc_assert (occr);
+		      if (!occr)
+			{
+			  occr = find_antic_occr_in_bb (expr, dominated);
+			  gcc_assert (occr);
+			}
 
 		      /* An occurence might've been already deleted
 			 while processing a dominator of BB.  */
 		      if (occr->deleted_p)
-			continue;
+			{
+			  gcc_assert (MAX_HOIST_DEPTH > 1);
+			  continue;
+			}
 
 		      insn = occr->insn;
 		      set = single_set (insn);
@@ -4519,6 +4682,7 @@  hoist_code (void)
       VEC_free (basic_block, heap, domby);
     }
 
+  free (bb_size);
   free (index_map);
 
   return changed;
diff --git a/gcc/params.def b/gcc/params.def
index f08d482..2329767 100644
--- a/gcc/params.def
+++ b/gcc/params.def
@@ -219,6 +219,21 @@  DEFPARAM(PARAM_GCSE_AFTER_RELOAD_CRITICAL_FRACTION,
 	"gcse-after-reload-critical-fraction",
 	"The threshold ratio of critical edges execution count that permit performing redundancy elimination after reload",
         10, 0, 0)
+
+/* GCSE will use GCSE_COST_DISTANCE_RATION as a scaling factor
+   to calculate maximum distance for which an expression is allowed to move
+   from its rtx_cost.  */
+DEFPARAM(PARAM_GCSE_COST_DISTANCE_RATIO,
+	 "gcse-cost-distance-ratio",
+	 "Scaling factor in calculation of maximum distance an expression can be moved by GCSE optimizations",
+	 2, 0, 0)
+/* GCSE won't restrict distance for which an expression with rtx_cost greater
+   than COSTS_N_INSN(GCSE_UNRESTRICTED_COST) is allowed to move.  */
+DEFPARAM(PARAM_GCSE_UNRESTRICTED_COST,
+	 "gcse-unrestricted-cost",
+	 "Cost at which GCSE optimizations will not constraint the distance an expression can travel",
+	 3, 0, 0)
+
 /* How deep from a given basic block the dominator tree should be searched
    for expressions to hoist to the block.  The value of 0 will avoid limiting
    the search.  */
diff --git a/gcc/params.h b/gcc/params.h
index c0404ca..aa96c81 100644
--- a/gcc/params.h
+++ b/gcc/params.h
@@ -125,6 +125,10 @@  typedef enum compiler_param
   PARAM_VALUE (PARAM_GCSE_AFTER_RELOAD_PARTIAL_FRACTION)
 #define GCSE_AFTER_RELOAD_CRITICAL_FRACTION \
   PARAM_VALUE (PARAM_GCSE_AFTER_RELOAD_CRITICAL_FRACTION)
+#define GCSE_COST_DISTANCE_RATIO \
+  PARAM_VALUE (PARAM_GCSE_COST_DISTANCE_RATIO)
+#define GCSE_UNRESTRICTED_COST \
+  PARAM_VALUE (PARAM_GCSE_UNRESTRICTED_COST)
 #define MAX_HOIST_DEPTH \
   PARAM_VALUE (PARAM_MAX_HOIST_DEPTH)
 #define MAX_UNROLLED_INSNS \