[3/8] aarch64: Split out aarch64_adjust_body_cost_sve

Message ID: mpt4kc6d994.fsf@arm.com
State: New
Series: aarch64 vector cost tweaks

Commit Message

Richard Sandiford Aug. 3, 2021, 12:04 p.m. UTC
This patch splits the SVE-specific part of aarch64_adjust_body_cost
out into its own subroutine, so that a future patch can call it
more than once.  I wondered about using a lambda to avoid having
to pass all the arguments, but in the end this way seemed clearer.
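
For the curious, the trade-off looks roughly like this (a toy sketch with made-up names, not GCC code): a lambda would capture the caller's locals by reference and need no parameter list, whereas the explicit helper chosen here documents every input and output in its signature.

#include <algorithm>

/* Lambda style: the helper reads and writes the enclosing locals
   directly, so nothing needs to be passed explicitly -- but nothing
   is documented either.  */
static unsigned int
adjust_cost_with_lambda (unsigned int body_cost)
{
  bool should_disparage = false;
  auto adjust_sve = [&] ()
    {
      body_cost = std::max (body_cost, 100U);
      should_disparage = true;
    };
  adjust_sve ();
  return should_disparage ? body_cost + 1 : body_cost;
}

/* Explicit-helper style, as in the patch: the signature spells out
   exactly what the subroutine consumes and updates.  */
static void
adjust_cost_sve_helper (unsigned int *body_cost, bool *should_disparage)
{
  *body_cost = std::max (*body_cost, 100U);
  *should_disparage = true;
}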

gcc/
	* config/aarch64/aarch64.c (aarch64_adjust_body_cost_sve): New
	function, split out from...
	(aarch64_adjust_body_cost): ...here.
---
 gcc/config/aarch64/aarch64.c | 220 ++++++++++++++++++++---------------
 1 file changed, 127 insertions(+), 93 deletions(-)
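
Two aspects of the moved code are worth a word before the diff.  First, the cost type: cycle estimates are kept as exact fractions rather than rounded integers, so that a limit like "3 predicate ops at 2 predicate ops per cycle" stays at 3/2 of a cycle.  A minimal stand-in for fractional_cost (not GCC's implementation; the numbers are made up) behaves like this:

#include <cstdint>
#include <cstdio>

/* Minimal stand-in for GCC's fractional_cost: an exact rational
   number of cycles.  */
struct frac_cost
{
  uint64_t num, den;

  double as_double () const { return (double) num / (double) den; }

  /* Compare a/b < c/d by cross-multiplying, avoiding rounding.  */
  bool operator< (const frac_cost &other) const
  { return num * other.den < other.num * den; }
};

int
main ()
{
  frac_cost pred_cycles = { 3, 2 };	/* 3 predicate ops, 2 per cycle.  */
  frac_cost nonpred_cycles = { 4, 3 };	/* 4 other ops, 3 per cycle.  */

  /* The overall issue limit is the slower of the two streams.  */
  frac_cost limit = nonpred_cycles < pred_cycles ? pred_cycles : nonpred_cycles;
  printf ("limit = %f cycles per iteration\n", limit.as_double ());
  return 0;
}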
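
Second, the two comparisons the new subroutine makes.  The standalone approximation below uses plain doubles in place of fractional_cost, hypothetical issue rates, the constant 16 in place of estimated_poly_value (BYTES_PER_SVE_VECTOR), and ceil in place of fractional_cost::scale, but its control flow mirrors aarch64_adjust_body_cost_sve in the patch:

#include <algorithm>
#include <cmath>
#include <cstdio>

int
main ()
{
  /* Hypothetical per-iteration issue estimates.  */
  double scalar_cycles_per_iter = 2.0;
  double sve_nonpred_cycles_per_iter = 1.5;
  double sve_pred_issue_cycles_per_iter = 3.0 / 2;  /* 3 ops, 2 per cycle.  */
  double advsimd_cycles_per_iter = 1.25;

  unsigned int orig_body_cost = 10;
  unsigned int body_cost = orig_body_cost;
  bool should_disparage = false;

  /* The overall limit is the slower of the two issue streams.  */
  double sve_cycles_per_iter
    = std::max (sve_nonpred_cycles_per_iter, sve_pred_issue_cycles_per_iter);

  /* Heuristic 1: if scalar code can issue at least as quickly as the
     predicate operations alone, predication is pure overhead; make
     the SVE loop prohibitively expensive.  */
  double sve_estimate = sve_pred_issue_cycles_per_iter + 1;
  if (scalar_cycles_per_iter < sve_estimate)
    {
      unsigned int min_cost = orig_body_cost * 16;  /* 16 ~ SVE bytes.  */
      if (body_cost < min_cost)
	{
	  body_cost = min_cost;
	  should_disparage = true;
	}
    }

  /* Heuristic 2: if Advanced SIMD could issue more quickly than SVE,
     scale the SVE cost up in proportion to the difference.  */
  if (sve_nonpred_cycles_per_iter >= sve_pred_issue_cycles_per_iter)
    sve_estimate = sve_cycles_per_iter;
  if (advsimd_cycles_per_iter < sve_estimate)
    {
      unsigned int factor
	= (unsigned int) std::ceil (sve_estimate / advsimd_cycles_per_iter);
      unsigned int min_cost = orig_body_cost * factor + 1;
      if (body_cost < min_cost)
	{
	  body_cost = min_cost;
	  should_disparage = true;
	}
    }

  printf ("body cost = %u, disparage = %d\n", body_cost, (int) should_disparage);
  return 0;
}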

Patch

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 17fcb34b2c8..b14b6f22aec 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -15488,6 +15488,126 @@  aarch64_estimate_min_cycles_per_iter
   return cycles;
 }
 
+/* Subroutine of aarch64_adjust_body_cost for handling SVE.
+   Use ISSUE_INFO to work out how fast the SVE code can be issued and compare
+   it to the equivalent value for scalar code (SCALAR_CYCLES_PER_ITER).
+   If COULD_USE_ADVSIMD is true, also compare it to the issue rate of
+   Advanced SIMD code (ADVSIMD_CYCLES_PER_ITER).
+
+   COSTS is as for aarch64_adjust_body_cost.  ORIG_BODY_COST is the cost
+   originally passed to aarch64_adjust_body_cost and *BODY_COST is the current
+   value of the adjusted cost.  *SHOULD_DISPARAGE is true if we think the loop
+   body is too expensive.  */
+
+static fractional_cost
+aarch64_adjust_body_cost_sve (const aarch64_vector_costs *costs,
+			      const aarch64_vec_issue_info *issue_info,
+			      fractional_cost scalar_cycles_per_iter,
+			      fractional_cost advsimd_cycles_per_iter,
+			      bool could_use_advsimd,
+			      unsigned int orig_body_cost,
+			      unsigned int *body_cost,
+			      bool *should_disparage)
+{
+  /* Estimate the minimum number of cycles per iteration needed to issue
+     non-predicate operations.  */
+  fractional_cost sve_nonpred_cycles_per_iter
+    = aarch64_estimate_min_cycles_per_iter (&costs->sve_ops,
+					    issue_info->sve);
+
+  /* Separately estimate the minimum number of cycles per iteration needed
+     to issue the predicate operations.  */
+  fractional_cost sve_pred_issue_cycles_per_iter
+    = { costs->sve_ops.pred_ops, issue_info->sve->pred_ops_per_cycle };
+
+  /* Calculate the overall limit on the number of cycles per iteration.  */
+  fractional_cost sve_cycles_per_iter
+    = std::max (sve_nonpred_cycles_per_iter, sve_pred_issue_cycles_per_iter);
+
+  if (dump_enabled_p ())
+    {
+      costs->sve_ops.dump ();
+      dump_printf_loc (MSG_NOTE, vect_location,
+		       "  estimated cycles per iteration = %f\n",
+		       sve_cycles_per_iter.as_double ());
+      dump_printf_loc (MSG_NOTE, vect_location,
+		       "  estimated cycles per iteration for non-predicate"
+		       " operations = %f\n",
+		       sve_nonpred_cycles_per_iter.as_double ());
+      if (costs->sve_ops.pred_ops)
+	dump_printf_loc (MSG_NOTE, vect_location, "  estimated cycles per"
+			 " iteration for predicate operations = %f\n",
+			 sve_pred_issue_cycles_per_iter.as_double ());
+    }
+
+  /* If the scalar version of the loop could issue at least as
+     quickly as the predicate parts of the SVE loop, make the SVE loop
+     prohibitively expensive.  In this case vectorization is adding an
+     overhead that the original scalar code didn't have.
+
+     This is mostly intended to detect cases in which WHILELOs dominate
+     for very tight loops, which is something that normal latency-based
+     costs would not model.  Adding this kind of cliffedge would be
+     too drastic for scalar_cycles_per_iter vs. sve_cycles_per_iter;
+     code in the caller handles that case in a more conservative way.  */
+  fractional_cost sve_estimate = sve_pred_issue_cycles_per_iter + 1;
+  if (scalar_cycles_per_iter < sve_estimate)
+    {
+      unsigned int min_cost
+	= orig_body_cost * estimated_poly_value (BYTES_PER_SVE_VECTOR);
+      if (*body_cost < min_cost)
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_NOTE, vect_location,
+			     "Increasing body cost to %d because the"
+			     " scalar code could issue within the limit"
+			     " imposed by predicate operations\n",
+			     min_cost);
+	  *body_cost = min_cost;
+	  *should_disparage = true;
+	}
+    }
+
+  /* If it appears that the Advanced SIMD version of a loop could issue
+     more quickly than the SVE one, increase the SVE cost in proportion
+     to the difference.  The intention is to make Advanced SIMD preferable
+     in cases where an Advanced SIMD version exists, without increasing
+     the costs so much that SVE won't be used at all.
+
+     The reasoning is similar to the scalar vs. predicate comparison above:
+     if the issue rate of the SVE code is limited by predicate operations
+     (i.e. if sve_pred_issue_cycles_per_iter > sve_nonpred_cycles_per_iter),
+     and if the Advanced SIMD code could issue within the limit imposed
+     by the predicate operations, the predicate operations are adding an
+     overhead that the original code didn't have and so we should prefer
+     the Advanced SIMD version.  However, if the predicate operations
+     do not dominate in this way, we should only increase the cost of
+     the SVE code if sve_cycles_per_iter is strictly greater than
+     advsimd_cycles_per_iter.  Given rounding effects, this should mean
+     that Advanced SIMD is either better or at least no worse.  */
+  if (sve_nonpred_cycles_per_iter >= sve_pred_issue_cycles_per_iter)
+    sve_estimate = sve_cycles_per_iter;
+  if (could_use_advsimd && advsimd_cycles_per_iter < sve_estimate)
+    {
+      /* This ensures that min_cost > orig_body_cost * 2.  */
+      unsigned int factor = fractional_cost::scale (1, sve_estimate,
+						    advsimd_cycles_per_iter);
+      unsigned int min_cost = orig_body_cost * factor + 1;
+      if (*body_cost < min_cost)
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_NOTE, vect_location,
+			     "Increasing body cost to %d because Advanced"
+			     " SIMD code could issue as quickly\n",
+			     min_cost);
+	  *body_cost = min_cost;
+	  *should_disparage = true;
+	}
+    }
+
+  return sve_cycles_per_iter;
+}
+
 /* BODY_COST is the cost of a vector loop body recorded in COSTS.
    Adjust the cost as necessary and return the new cost.  */
 static unsigned int
@@ -15583,101 +15703,15 @@  aarch64_adjust_body_cost (aarch64_vector_costs *costs, unsigned int body_cost)
 
   if ((costs->vec_flags & VEC_ANY_SVE) && issue_info->sve)
     {
-      /* Estimate the minimum number of cycles per iteration needed to issue
-	 non-predicate operations.  */
-      fractional_cost sve_cycles_per_iter
-	= aarch64_estimate_min_cycles_per_iter (&costs->sve_ops,
-						issue_info->sve);
-
-      /* Separately estimate the minimum number of cycles per iteration needed
-	 to issue the predicate operations.  */
-      fractional_cost pred_cycles_per_iter
-	= { costs->sve_ops.pred_ops, issue_info->sve->pred_ops_per_cycle };
-
       if (dump_enabled_p ())
-	{
-	  dump_printf_loc (MSG_NOTE, vect_location, "SVE issue estimate:\n");
-	  costs->sve_ops.dump ();
-	  dump_printf_loc (MSG_NOTE, vect_location,
-			   "  estimated cycles per iteration for non-predicate"
-			   " operations = %f\n",
-			   sve_cycles_per_iter.as_double ());
-	  if (costs->sve_ops.pred_ops)
-	    dump_printf_loc (MSG_NOTE, vect_location, "  estimated cycles per"
-			     " iteration for predicate operations = %d\n",
-			     pred_cycles_per_iter.as_double ());
-	}
-
-      vector_cycles_per_iter = std::max (sve_cycles_per_iter,
-					 pred_cycles_per_iter);
+	dump_printf_loc (MSG_NOTE, vect_location, "SVE issue estimate:\n");
       vector_reduction_latency = costs->sve_ops.reduction_latency;
-
-      /* If the scalar version of the loop could issue at least as
-	 quickly as the predicate parts of the SVE loop, make the SVE loop
-	 prohibitively expensive.  In this case vectorization is adding an
-	 overhead that the original scalar code didn't have.
-
-	 This is mostly intended to detect cases in which WHILELOs dominate
-	 for very tight loops, which is something that normal latency-based
-	 costs would not model.  Adding this kind of cliffedge would be
-	 too drastic for scalar_cycles_per_iter vs. sve_cycles_per_iter;
-	 code later in the function handles that case in a more
-	 conservative way.  */
-      fractional_cost sve_estimate = pred_cycles_per_iter + 1;
-      if (scalar_cycles_per_iter < sve_estimate)
-	{
-	  unsigned int min_cost
-	    = orig_body_cost * estimated_poly_value (BYTES_PER_SVE_VECTOR);
-	  if (body_cost < min_cost)
-	    {
-	      if (dump_enabled_p ())
-		dump_printf_loc (MSG_NOTE, vect_location,
-				 "Increasing body cost to %d because the"
-				 " scalar code could issue within the limit"
-				 " imposed by predicate operations\n",
-				 min_cost);
-	      body_cost = min_cost;
-	      should_disparage = true;
-	    }
-	}
-
-      /* If it appears that the Advanced SIMD version of a loop could issue
-	 more quickly than the SVE one, increase the SVE cost in proportion
-	 to the difference.  The intention is to make Advanced SIMD preferable
-	 in cases where an Advanced SIMD version exists, without increasing
-	 the costs so much that SVE won't be used at all.
-
-	 The reasoning is similar to the scalar vs. predicate comparison above:
-	 if the issue rate of the SVE code is limited by predicate operations
-	 (i.e. if pred_cycles_per_iter > sve_cycles_per_iter), and if the
-	 Advanced SIMD code could issue within the limit imposed by the
-	 predicate operations, the predicate operations are adding an
-	 overhead that the original code didn't have and so we should prefer
-	 the Advanced SIMD version.  However, if the predicate operations
-	 do not dominate in this way, we should only increase the cost of
-	 the SVE code if sve_cycles_per_iter is strictly greater than
-	 advsimd_cycles_per_iter.  Given rounding effects, this should mean
-	 that Advanced SIMD is either better or at least no worse.  */
-      if (sve_cycles_per_iter >= pred_cycles_per_iter)
-	sve_estimate = sve_cycles_per_iter;
-      if (could_use_advsimd && advsimd_cycles_per_iter < sve_estimate)
-	{
-	  /* This ensures that min_cost > orig_body_cost * 2.  */
-	  unsigned int factor
-	    = fractional_cost::scale (1, sve_estimate,
-				      advsimd_cycles_per_iter);
-	  unsigned int min_cost = orig_body_cost * factor + 1;
-	  if (body_cost < min_cost)
-	    {
-	      if (dump_enabled_p ())
-		dump_printf_loc (MSG_NOTE, vect_location,
-				 "Increasing body cost to %d because Advanced"
-				 " SIMD code could issue as quickly\n",
-				 min_cost);
-	      body_cost = min_cost;
-	      should_disparage = true;
-	    }
-	}
+      vector_cycles_per_iter
+	= aarch64_adjust_body_cost_sve (costs, issue_info,
+					scalar_cycles_per_iter,
+					advsimd_cycles_per_iter,
+					could_use_advsimd, orig_body_cost,
+					&body_cost, &should_disparage);
     }
 
   /* Decide whether to stick to latency-based costs or whether to try to