===================================================================
@@ -264,6 +264,7 @@ static reg_class_t arm_preferred_rename_
static unsigned int arm_autovectorize_vector_sizes (void);
static int arm_default_branch_cost (bool, bool);
static int arm_cortex_a5_branch_cost (bool, bool);
+static int arm_cortex_m_branch_cost (bool, bool);
static bool arm_vectorize_vec_perm_const_ok (enum machine_mode vmode,
const unsigned char *sel);
@@ -948,6 +949,24 @@ const struct tune_params arm_cortex_tune
arm_default_branch_cost
};
+/* Cortex-M tuning. On Cortex-M4 cores for example, a MOVW/MOVT pair takes two
+ cycles to execute (one cycle each). An LDR from the constant pool also takes
+ two cycles to execute, but mildly increases pipelining opportunity (consecutive
+ loads/stores can be pipelined together, saving one cycle), and may also
+ improve icache utilisation. Hence we prefer the constant pool for such
+ processors. */
+
+const struct tune_params arm_cortex_m_tune =
+{
+ arm_9e_rtx_costs,
+ NULL,
+ 1, /* Constant limit. */
+ 5, /* Max cond insns. */
+ ARM_PREFETCH_NOT_BENEFICIAL,
+ true, /* Prefer constant pool. */
+ arm_cortex_m_branch_cost
+};
+
/* Branches can be dual-issued on Cortex-A5, so conditional execution is
less appealing. Set max_insns_skipped to a low value. */
@@ -8631,6 +8650,20 @@ arm_cortex_a5_branch_cost (bool speed_p,
return speed_p ? 0 : arm_default_branch_cost (speed_p, predictable_p);
}
+/* Return the branch cost for Cortex-M: 1 if TARGET_32BIT and SPEED_P, else
+ whatever arm_default_branch_cost gives. Thumb-2 branches are relatively cheap
+ here ("1 + P cycles" on Cortex-M4, where P varies from 1 to 3 according to
+ some criteria), since non-executed instructions in IT blocks probably take as
+ long as executed ones (and the IT instruction itself takes space in icache).
+ This value gave good experimental results on a popular embedded benchmark. */
+
+static int
+arm_cortex_m_branch_cost (bool speed_p, bool predictable_p)
+{
+ return (TARGET_32BIT && speed_p) ? 1
+ : arm_default_branch_cost (speed_p, predictable_p);
+}
+
static bool fp_consts_inited = false;
static REAL_VALUE_TYPE value_fp0;
===================================================================
@@ -133,8 +133,8 @@ ARM_CORE("cortex-a15", cortexa15, 7A,
ARM_CORE("cortex-r4", cortexr4, 7R, FL_LDSCHED, cortex)
ARM_CORE("cortex-r4f", cortexr4f, 7R, FL_LDSCHED, cortex)
ARM_CORE("cortex-r5", cortexr5, 7R, FL_LDSCHED | FL_ARM_DIV, cortex)
-ARM_CORE("cortex-m4", cortexm4, 7EM, FL_LDSCHED, cortex)
-ARM_CORE("cortex-m3", cortexm3, 7M, FL_LDSCHED, cortex)
-ARM_CORE("cortex-m1", cortexm1, 6M, FL_LDSCHED, cortex)
-ARM_CORE("cortex-m0", cortexm0, 6M, FL_LDSCHED, cortex)
-ARM_CORE("cortex-m0plus", cortexm0plus, 6M, FL_LDSCHED, cortex)
+ARM_CORE("cortex-m4", cortexm4, 7EM, FL_LDSCHED, cortex_m)
+ARM_CORE("cortex-m3", cortexm3, 7M, FL_LDSCHED, cortex_m)
+ARM_CORE("cortex-m1", cortexm1, 6M, FL_LDSCHED, cortex_m)
+ARM_CORE("cortex-m0", cortexm0, 6M, FL_LDSCHED, cortex_m)
+ARM_CORE("cortex-m0plus", cortexm0plus, 6M, FL_LDSCHED, cortex_m)
Hi, This patch provides some mild performance tuning for Cortex-M series processors -- originally measured on a Cortex-M4 device (circa March 2011, on a 4.5-based branch), but making the changes for all Cortex-M devices -- using the possibly-unwarranted assumption that other Cortex-M devices are likely to be similar at a microarchitectural level. Whether that's true or not, I suspect Cortex-M cores are more similar to each other than to "generic" Cortex-R or Cortex-A cores, though currently they will be tuned for in the same way as those. We've used an analogous patch in (some of) our toolchains for a while, with no ill effects, though I'm not set up to test mainline on Cortex-M cores at the moment, nor to re-benchmark the patch. The compiler builds, and spot-checks look fine though. So, OK to apply, or does this need further testing/benchmarking? Maybe someone within ARM has a suitable way of doing the latter easily, if required? Thanks, Julian ChangeLog gcc/ * config/arm/arm.c (arm_cortex_m_branch_cost): New. (arm_cortex_m_tune): New. * config/arm/arm-cores.def (cortex-m4, cortex-m3, cortex-m1) (cortex-m0, cortex-m0plus): Use Cortex-M-specific tuning.