diff mbox

[1/4,AArch64] Add scheduling and cost models for Exynos M1

Message ID 563A9234.1000301@samsung.com
State New
Headers show

Commit Message

Evandro Menezes Nov. 4, 2015, 11:18 p.m. UTC
This patch adds extra tuning information about AArch64 targets:

  * Maximum number of case values before resorting to a jump table
    The default values assumed independently of the specific backends
    may be rather low for modern processors, which sport quite efficient
    direct branch prediction, whereas indirect branch prediction is
    still typically not so efficient.  This value may be specifically
    set for a processor or left at zero to use the default values.
  * L1 cache line size
    The auto-prefetcher uses this information when emitting software
    prefetch insns.

Please, commit if it's alright.

Thank you,
diff mbox

Patch

diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def
index 0ab1ca8..66be417 100644
--- a/gcc/config/aarch64/aarch64-cores.def
+++ b/gcc/config/aarch64/aarch64-cores.def
@@ -43,7 +43,7 @@ 
 AARCH64_CORE("cortex-a53",  cortexa53, cortexa53, 8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa53, "0x41", "0xd03")
 AARCH64_CORE("cortex-a57",  cortexa57, cortexa57, 8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa57, "0x41", "0xd07")
 AARCH64_CORE("cortex-a72",  cortexa72, cortexa57, 8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC, cortexa72, "0x41", "0xd08")
-AARCH64_CORE("exynos-m1",   exynosm1,  cortexa57, 8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, cortexa72, "0x53", "0x001")
+AARCH64_CORE("exynos-m1",   exynosm1,  exynosm1,  8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, exynosm1, "0x53", "0x001")
 AARCH64_CORE("thunderx",    thunderx,  thunderx,  8A,  AARCH64_FL_FOR_ARCH8 | AARCH64_FL_CRC | AARCH64_FL_CRYPTO, thunderx,  "0x43", "0x0a1")
 AARCH64_CORE("xgene1",      xgene1,    xgene1,    8A,  AARCH64_FL_FOR_ARCH8, xgene1, "0x50", "0x000")
 
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 81792bc..ecf4685 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -195,6 +195,9 @@  struct tune_params
   int vec_reassoc_width;
   int min_div_recip_mul_sf;
   int min_div_recip_mul_df;
+  int max_case_values; /* Case values threshold; or 0 for the default.  */
+
+  int cache_line_size; /* Cache line size; or 0 for the default.  */
 
 /* An enum specifying how to take into account CPU autoprefetch capabilities
    during instruction scheduling:
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 5c8604f..e7f1c07 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -355,6 +355,8 @@  static const struct tune_params generic_tunings =
   1,	/* vec_reassoc_width.  */
   2,	/* min_div_recip_mul_sf.  */
   2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
 };
@@ -378,6 +380,8 @@  static const struct tune_params cortexa53_tunings =
   1,	/* vec_reassoc_width.  */
   2,	/* min_div_recip_mul_sf.  */
   2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
 };
@@ -401,6 +405,8 @@  static const struct tune_params cortexa57_tunings =
   1,	/* vec_reassoc_width.  */
   2,	/* min_div_recip_mul_sf.  */
   2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS)	/* tune_flags.  */
 };
@@ -424,6 +430,8 @@  static const struct tune_params cortexa72_tunings =
   1,	/* vec_reassoc_width.  */
   2,	/* min_div_recip_mul_sf.  */
   2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
 };
@@ -446,6 +454,8 @@  static const struct tune_params thunderx_tunings =
   1,	/* vec_reassoc_width.  */
   2,	/* min_div_recip_mul_sf.  */
   2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
 };
@@ -468,6 +478,8 @@  static const struct tune_params xgene1_tunings =
   1,	/* vec_reassoc_width.  */
   2,	/* min_div_recip_mul_sf.  */
   2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
 };
@@ -3242,6 +3254,20 @@  aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
   return aarch64_tls_referenced_p (x);
 }
 
+/* Implement TARGET_CASE_VALUES_THRESHOLD.  */
+
+static unsigned int
+aarch64_case_values_threshold (void)
+{
+  /* Use the specified limit for the number of cases before using jump
+     tables at higher optimization levels.  */
+  if (optimize > 2
+      && selected_cpu->tune->max_case_values != 0)
+    return selected_cpu->tune->max_case_values;
+  else
+    return default_case_values_threshold ();
+}
+
 /* Return true if register REGNO is a valid index register.
    STRICT_P is true if REG_OK_STRICT is in effect.  */
 
@@ -7672,6 +7698,13 @@  aarch64_override_options_internal (struct gcc_options *opts)
 			 opts->x_param_values,
 			 global_options_set.x_param_values);
 
+  /* Set the L1 cache line size.  */
+  if (selected_cpu->tune->cache_line_size != 0)
+    maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
+			   selected_cpu->tune->cache_line_size,
+			   opts->x_param_values,
+			   global_options_set.x_param_values);
+
   aarch64_override_options_after_change_1 (opts);
 }
 
@@ -13385,6 +13418,7 @@  aarch64_promoted_type (const_tree t)
     return float_type_node;
   return NULL_TREE;
 }
+
 #undef TARGET_ADDRESS_COST
 #define TARGET_ADDRESS_COST aarch64_address_cost
 
@@ -13432,6 +13466,9 @@  aarch64_promoted_type (const_tree t)
 #undef TARGET_CANNOT_FORCE_CONST_MEM
 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
 
+#undef TARGET_CASE_VALUES_THRESHOLD
+#define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
+
 #undef TARGET_CONDITIONAL_REGISTER_USAGE
 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 9c4eb52..d526b52 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -202,6 +202,7 @@ 
 ;; Scheduling
 (include "../arm/cortex-a53.md")
 (include "../arm/cortex-a57.md")
+(include "../arm/exynos-m1.md")
 (include "thunderx.md")
 (include "../arm/xgene1.md")
 
diff --git a/gcc/config/arm/aarch-cost-tables.h b/gcc/config/arm/aarch-cost-tables.h
index 66e09a8..850bde0 100644
--- a/gcc/config/arm/aarch-cost-tables.h
+++ b/gcc/config/arm/aarch-cost-tables.h
@@ -331,6 +331,109 @@  const struct cpu_cost_table cortexa57_extra_costs =
   }
 };
 
+const struct cpu_cost_table exynosm1_extra_costs =
+{
+  /* ALU */
+  {
+    0,                 /* arith.  */
+    0,                 /* logical.  */
+    0,                 /* shift.  */
+    COSTS_N_INSNS (0), /* shift_reg.  */
+    0,                 /* arith_shift.  */
+    COSTS_N_INSNS (1), /* arith_shift_reg.  */
+    0,                 /* log_shift.  */
+    COSTS_N_INSNS (1), /* log_shift_reg.  */
+    0,                 /* extend.  */
+    COSTS_N_INSNS (1), /* extend_arith.  */
+    0,                 /* bfi.  */
+    0,                 /* bfx.  */
+    0,                 /* clz.  */
+    0,                 /* rev.  */
+    0,                 /* non_exec.  */
+    true               /* non_exec_costs_exec.  */
+  },
+  {
+    /* MULT SImode */
+    {
+      COSTS_N_INSNS (2),       /* simple.  */
+      COSTS_N_INSNS (3),       /* flag_setting.  */
+      COSTS_N_INSNS (4),       /* extend.  */
+      COSTS_N_INSNS (2),       /* add.  */
+      COSTS_N_INSNS (4),       /* extend_add.  */
+      COSTS_N_INSNS (19)       /* idiv.  */
+    },
+    /* MULT DImode */
+    {
+      COSTS_N_INSNS (3),       /* simple.  */
+      0,                       /* flag_setting (N/A).  */
+      COSTS_N_INSNS (4),       /* extend.  */
+      COSTS_N_INSNS (3),       /* add.  */
+      COSTS_N_INSNS (4),       /* extend_add.  */
+      COSTS_N_INSNS (35)       /* idiv.  */
+    }
+  },
+  /* LD/ST */
+  {
+    COSTS_N_INSNS (3),         /* load.  */
+    COSTS_N_INSNS (4),         /* load_sign_extend.  */
+    COSTS_N_INSNS (3),         /* ldrd.  */
+    COSTS_N_INSNS (2),         /* ldm_1st.  */
+    1,                         /* ldm_regs_per_insn_1st.  */
+    2,                         /* ldm_regs_per_insn_subsequent.  */
+    COSTS_N_INSNS (4),         /* loadf.  */
+    COSTS_N_INSNS (4),         /* loadd.  */
+    COSTS_N_INSNS (4),         /* load_unaligned.  */
+    0,                         /* store.  */
+    0,                         /* strd.  */
+    0,                         /* stm_1st.  */
+    1,                         /* stm_regs_per_insn_1st.  */
+    2,                         /* stm_regs_per_insn_subsequent.  */
+    0,                         /* storef.  */
+    0,                         /* stored.  */
+    0,                         /* store_unaligned.  */
+    COSTS_N_INSNS (1),         /* loadv.  */
+    COSTS_N_INSNS (1)          /* storev.  */
+  },
+  {
+    /* FP SFmode */
+    {
+      COSTS_N_INSNS (21),      /* div.  */
+      COSTS_N_INSNS (3),       /* mult.  */
+      COSTS_N_INSNS (4),       /* mult_addsub.  */
+      COSTS_N_INSNS (4),       /* fma.  */
+      COSTS_N_INSNS (2),       /* addsub.  */
+      COSTS_N_INSNS (0),       /* fpconst.  */
+      COSTS_N_INSNS (0),       /* neg.  */
+      COSTS_N_INSNS (3),       /* compare.  */
+      COSTS_N_INSNS (2),       /* widen.  */
+      COSTS_N_INSNS (2),       /* narrow.  */
+      COSTS_N_INSNS (12),      /* toint.  */
+      COSTS_N_INSNS (7),       /* fromint.  */
+      COSTS_N_INSNS (2)        /* roundint.  */
+    },
+    /* FP DFmode */
+    {
+      COSTS_N_INSNS (34),      /* div.  */
+      COSTS_N_INSNS (3),       /* mult.  */
+      COSTS_N_INSNS (4),       /* mult_addsub.  */
+      COSTS_N_INSNS (4),       /* fma.  */
+      COSTS_N_INSNS (2),       /* addsub.  */
+      COSTS_N_INSNS (0),       /* fpconst.  */
+      COSTS_N_INSNS (0),       /* neg.  */
+      COSTS_N_INSNS (3),       /* compare.  */
+      COSTS_N_INSNS (2),       /* widen.  */
+      COSTS_N_INSNS (2),       /* narrow.  */
+      COSTS_N_INSNS (12),      /* toint.  */
+      COSTS_N_INSNS (7),       /* fromint.  */
+      COSTS_N_INSNS (2)        /* roundint.  */
+    }
+  },
+  /* Vector */
+  {
+    COSTS_N_INSNS (0)  /* alu.  */
+  }
+};
+
 const struct cpu_cost_table xgene1_extra_costs =
 {
   /* ALU */
diff --git a/gcc/config/arm/arm-cores.def b/gcc/config/arm/arm-cores.def
index 4c35200..18936f0 100644
--- a/gcc/config/arm/arm-cores.def
+++ b/gcc/config/arm/arm-cores.def
@@ -168,7 +168,7 @@  ARM_CORE("cortex-a17.cortex-a7", cortexa17cortexa7, cortexa7,	7A,	ARM_FSET_MAKE_
 ARM_CORE("cortex-a53",	cortexa53, cortexa53,	8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a53)
 ARM_CORE("cortex-a57",	cortexa57, cortexa57,	8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a57)
 ARM_CORE("cortex-a72",	cortexa72, cortexa57,	8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a57)
-ARM_CORE("exynos-m1",	exynosm1,  cortexa57,	8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), cortex_a57)
+ARM_CORE("exynos-m1",	exynosm1,  exynosm1,	8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_CRC32 | FL_FOR_ARCH8A), exynosm1)
 ARM_CORE("xgene1",      xgene1,    xgene1,      8A,	ARM_FSET_MAKE_CPU1 (FL_LDSCHED | FL_FOR_ARCH8A),            xgene1)
 
 /* V8 big.LITTLE implementations */
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 4310638..6d5a64e 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -1991,6 +1991,29 @@  const struct tune_params arm_cortex_a57_tune =
   tune_params::SCHED_AUTOPREF_FULL
 };
 
+const struct tune_params arm_exynos_m1_tune =
+{
+  arm_9e_rtx_costs,
+  &exynosm1_extra_costs,
+  NULL,						/* Sched adj cost.  */
+  arm_default_branch_cost,
+  &arm_default_vec_cost,
+  1,						/* Constant limit.  */
+  2,						/* Max cond insns.  */
+  8,						/* Memset max inline.  */
+  3,						/* Issue rate.  */
+  ARM_PREFETCH_NOT_BENEFICIAL,
+  tune_params::PREF_CONST_POOL_FALSE,
+  tune_params::PREF_LDRD_TRUE,
+  tune_params::LOG_OP_NON_SHORT_CIRCUIT_FALSE,	/* Thumb.  */
+  tune_params::LOG_OP_NON_SHORT_CIRCUIT_FALSE,	/* ARM.  */
+  tune_params::DISPARAGE_FLAGS_ALL,
+  tune_params::PREF_NEON_64_FALSE,
+  tune_params::PREF_NEON_STRINGOPS_TRUE,
+  tune_params::FUSE_NOTHING,
+  tune_params::SCHED_AUTOPREF_OFF
+};
+
 const struct tune_params arm_xgene1_tune =
 {
   arm_9e_rtx_costs,
diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
index 02e147e..e6f07e9 100644
--- a/gcc/config/arm/arm.md
+++ b/gcc/config/arm/arm.md
@@ -377,7 +377,7 @@ 
                                 arm1136jfs,cortexa5,cortexa7,cortexa8,\
                                 cortexa9,cortexa12,cortexa15,cortexa17,\
                                 cortexa53,cortexa57,cortexm4,cortexm7,\
-				marvell_pj4,xgene1")
+				exynosm1,marvell_pj4,xgene1")
 	       (eq_attr "tune_cortexr4" "yes"))
           (const_string "no")
           (const_string "yes"))))
@@ -416,6 +416,7 @@ 
 (include "cortex-m7.md")
 (include "cortex-m4.md")
 (include "cortex-m4-fpu.md")
+(include "exynos-m1.md")
 (include "vfp11.md")
 (include "marvell-pj4.md")
 (include "xgene1.md")
diff --git a/gcc/config/arm/exynos-m1.md b/gcc/config/arm/exynos-m1.md
new file mode 100644
index 0000000..a4f1138
--- /dev/null
+++ b/gcc/config/arm/exynos-m1.md
@@ -0,0 +1,974 @@ 
+;; Samsung Exynos M1 pipeline description
+;; Copyright (C) 2014-2015 Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but
+;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;; General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+(define_attr "exynos_m1_type"
+  "mla, mlal,
+   unknown"
+  (cond [
+	  (eq_attr "type" "mla, mlas, smlad, smladx,\
+			   smlawx, smlawy, smlaxy,\
+			   smlsd, smlsdx")
+	    (const_string "mla")
+
+	  (eq_attr "type" "smlal, smlals, smlald,\
+			   smlalxy, smlsld,\
+			   umaal, umlal, umlals")
+	    (const_string "mlal")]
+
+	  (const_string "unknown")))
+
+(define_attr "exynos_m1_neon_type"
+  "neon_arith_simple, neon_arith_basic, neon_arith_complex,
+   neon_multiply, neon_mla, neon_mla_q, neon_mla_long, neon_sat_mla_long,
+   neon_shift_acc, neon_shift_imm_basic, neon_shift_imm_complex,
+   neon_shift_reg_basic, neon_shift_reg_basic_q,
+   neon_shift_reg_complex, neon_shift_reg_complex_q,
+   neon_fp_unary, neon_fp_add, neon_fp_abd, neon_fp_compare,
+   neon_fp_reduc_minmax, neon_fp_reduc_add, neon_fp_round, neon_fp_cvt,
+   neon_fp_minmax, neon_fp_mul, neon_fp_mul_q, neon_fp_mla, neon_fp_mla_q,
+   neon_fp_estimate, neon_fp_estimatex, neon_fp_step,
+   neon_bitops, neon_bitops_q, neon_bitins,
+   neon_to_gp, neon_from_gp, neon_move, neon_tbl,
+   neon_load1_1, neon_load1_2, neon_load1_3, neon_load1_4,
+   neon_load1_one, neon_load1_all,
+   neon_load2_2, neon_load2_one, neon_load2_all,
+   neon_load3_3, neon_load3_one, neon_load3_all,
+   neon_load4_4, neon_load4_one, neon_load4_all,
+   neon_store,
+   neon_store1_1, neon_store1_2, neon_store1_3, neon_store1_4, neon_store1_one,
+   neon_store2_2, neon_store2_one,
+   neon_store3_3, neon_store3_one,
+   neon_store4_4, neon_store4_one,
+   unknown"
+  (cond [
+	  (eq_attr "type" "neon_abd, neon_abd_q, neon_abd_long,\
+			   neon_abs, neon_abs_q,\
+			   neon_minmax, neon_minmax_q")
+	    (const_string "neon_arith_simple")
+
+	  (eq_attr "type" "neon_add, neon_add_q, neon_add_long,\
+			   neon_neg, neon_neg_q,\
+			   neon_sub, neon_sub_q, neon_sub_long, neon_sub_widen,\
+			   neon_logic, neon_logic_q, neon_tst, neon_tst_q,\
+			   neon_compare_zero, neon_compare_zero_q")
+	    (const_string "neon_arith_basic")
+
+	  (eq_attr "type" "neon_add_widen, neon_arith_acc, neon_arith_acc_q,\
+			   neon_reduc_add, neon_reduc_add_q,\
+			   neon_reduc_add_acc, neon_reduc_add_acc_q,\
+			   neon_reduc_add_long, neon_add_halve_narrow_q,\
+			   neon_add_halve, neon_add_halve_q,\
+			   neon_sub_halve, neon_sub_halve_q, neon_qabs,\
+			   neon_qabs_q, neon_qadd, neon_qadd_q, neon_qneg,\
+			   neon_qneg_q, neon_qsub, neon_qsub_q,\
+			   neon_sub_halve_narrow_q,\
+			   neon_compare, neon_compare_q,\
+			   neon_reduc_minmax, neon_reduc_minmax_q")
+	    (const_string "neon_arith_complex")
+
+	  (eq_attr "type" "neon_mul_b, neon_mul_b_q, neon_mul_h, neon_mul_h_q,\
+			   neon_mul_s, neon_mul_s_q,\
+			   neon_mul_h_scalar, neon_mul_h_scalar_q,\
+			   neon_mul_s_scalar, neon_mul_s_scalar_q,\
+			   neon_mul_h_scalar_long, neon_mul_s_scalar_long,\
+			   neon_sat_mul_b, neon_sat_mul_b_q,\
+			   neon_sat_mul_h, neon_sat_mul_h_q,\
+			   neon_sat_mul_s, neon_sat_mul_s_q,\
+			   neon_sat_mul_h_scalar, neon_sat_mul_h_scalar_q,\
+			   neon_sat_mul_s_scalar, neon_sat_mul_s_scalar_q,\
+			   neon_sat_mul_b_long, neon_sat_mul_h_long,\
+			   neon_sat_mul_s_long, neon_sat_mul_h_scalar_long,\
+			   neon_sat_mul_s_scalar_long")
+	    (const_string "neon_multiply")
+
+	  (eq_attr "type" "neon_mla_b, neon_mla_h, neon_mla_s,\
+			   neon_mla_h_scalar, neon_mla_s_scalar,\
+			   neon_mla_b_long, neon_mla_h_long,\
+			   neon_mla_s_long,\
+			   neon_mla_h_scalar_long, neon_mla_s_scalar_long,\
+			   neon_mla_b_q, neon_mla_h_q, neon_mla_s_q,\
+			   neon_mla_h_scalar_q, neon_mla_s_scalar_q")
+	    (const_string "neon_mla")
+
+	  (eq_attr "type" "neon_sat_mla_b_long, neon_sat_mla_h_long,\
+			   neon_sat_mla_s_long, neon_sat_mla_h_scalar_long,\
+			   neon_sat_mla_s_scalar_long")
+	    (const_string "neon_sat_mla_long")
+
+	  (eq_attr "type" "neon_shift_acc, neon_shift_acc_q")
+	    (const_string "neon_shift_acc")
+
+	  (eq_attr "type" "neon_shift_imm, neon_shift_imm_q,\
+			   neon_shift_imm_narrow_q, neon_shift_imm_long")
+	    (const_string "neon_shift_imm_basic")
+
+	  (eq_attr "type" "neon_sat_shift_imm, neon_sat_shift_imm_q,\
+			   neon_sat_shift_imm_narrow_q")
+	    (const_string "neon_shift_imm_complex")
+
+	  (eq_attr "type" "neon_shift_reg, neon_shift_reg_q")
+	    (const_string "neon_shift_reg_basic")
+
+	  (eq_attr "type" "neon_sat_shift_reg, neon_sat_shift_reg_q")
+	    (const_string "neon_shift_reg_complex")
+
+	  (eq_attr "type" "neon_fp_neg_s, neon_fp_neg_s_q,\
+			   neon_fp_abs_s, neon_fp_abs_s_q,\
+			   neon_fp_neg_d, neon_fp_neg_d_q,\
+			   neon_fp_abs_d, neon_fp_abs_d_q")
+	    (const_string "neon_fp_unary")
+
+	  (eq_attr "type" "neon_fp_addsub_s, neon_fp_addsub_s_q,\
+			   neon_fp_addsub_d, neon_fp_addsub_d_q")
+	    (const_string "neon_fp_add")
+
+	  (eq_attr "type" "neon_fp_abd_s, neon_fp_abd_s_q,\
+			   neon_fp_abd_d, neon_fp_abd_d_q")
+	    (const_string "neon_fp_abd")
+
+	  (eq_attr "type" "neon_fp_compare_s, neon_fp_compare_s_q,\
+			   neon_fp_compare_d, neon_fp_compare_d_q,\
+			   neon_fp_minmax_s, neon_fp_minmax_s_q,\
+			   neon_fp_minmax_d, neon_fp_minmax_d_q")
+	    (const_string "neon_fp_compare")
+
+	  (eq_attr "type" "neon_fp_reduc_minmax_s, neon_fp_reduc_minmax_s_q,\
+			   neon_fp_reduc_minmax_d, neon_fp_reduc_minmax_d_q")
+	    (const_string "neon_fp_reduc_minmax")
+
+	  (eq_attr "type" "neon_fp_reduc_add_s, neon_fp_reduc_add_s_q,\
+			   neon_fp_reduc_add_d, neon_fp_reduc_add_d_q")
+	    (const_string "neon_fp_reduc_add")
+
+	  (eq_attr "type" "neon_fp_round_s, neon_fp_round_s_q,\
+			   neon_fp_round_d, neon_fp_round_d_q")
+	    (const_string "neon_fp_round")
+
+	  (eq_attr "type" "neon_fp_cvt_narrow_s_q, neon_fp_cvt_widen_h,
+			   neon_fp_to_int_s, neon_fp_to_int_s_q,\
+			   neon_fp_to_int_d_q, neon_fp_to_int_d,\
+			   neon_int_to_fp_s, neon_int_to_fp_s_q,\
+			   neon_int_to_fp_d, neon_int_to_fp_d_q")
+	    (const_string "neon_fp_cvt")
+
+	  (eq_attr "type" "neon_fp_mul_s, neon_fp_mul_s_q,\
+			   neon_fp_mul_s_scalar, neon_fp_mul_s_scalar_q,\
+			   neon_fp_mul_d, neon_fp_mul_d_q,\
+			   neon_fp_mul_d_scalar_q")
+	    (const_string "neon_fp_mul")
+
+	  (eq_attr "type" "neon_fp_mla_s, neon_fp_mla_s_q,\
+			   neon_fp_mla_s_scalar,neon_fp_mla_s_scalar_q,\
+			   neon_fp_mla_d, neon_fp_mla_d_q,\
+			   neon_fp_mla_d_scalar_q")
+	    (const_string "neon_fp_mla")
+
+	  (eq_attr "type" "neon_fp_recpe_s, neon_fp_recpe_s_q,\
+			   neon_fp_rsqrte_s, neon_fp_rsqrte_s_q,\
+			   neon_fp_recpe_d, neon_fp_recpe_d_q,\
+			   neon_fp_rsqrte_d, neon_fp_rsqrte_d_q")
+	    (const_string "neon_fp_estimate")
+
+	  (eq_attr "type" "neon_fp_recpx_s, neon_fp_recpx_s_q,\
+			   neon_fp_recpx_d, neon_fp_recpx_d_q")
+	    (const_string "neon_fp_estimatex")
+
+	  (eq_attr "type" "neon_fp_recps_s, neon_fp_recps_s_q,\
+			   neon_fp_rsqrts_s, neon_fp_rsqrts_s_q,\
+			   neon_fp_recps_d, neon_fp_recps_d_q,\
+			   neon_fp_rsqrts_d, neon_fp_rsqrts_d_q")
+	    (const_string "neon_fp_step")
+
+	  (eq_attr "type" "neon_rbit, neon_rbit_q,\
+			   neon_cls, neon_cls_q, neon_cnt, neon_cnt_q,\
+			   neon_dup, neon_dup_q,\
+			   neon_rev, neon_rev_q,\
+			   neon_move, neon_move_q,
+			   neon_ext, neon_permute, neon_zip")
+	    (const_string "neon_bitops")
+
+	  (eq_attr "type" "neon_ext_q, neon_permute_q, neon_zip_q")
+	    (const_string "neon_bitops_q")
+
+	  (eq_attr "type" "neon_bsl, neon_bsl_q")
+	    (const_string "neon_bitins")
+
+	  (eq_attr "type" "neon_tbl1, neon_tbl2, neon_tbl3, neon_tbl4")
+	    (const_string "neon_tbl")
+
+	  (eq_attr "type" "neon_from_gp, neon_from_gp_q, f_mcr, f_mcrr")
+	    (const_string "neon_from_gp")
+
+	  (eq_attr "type" "neon_to_gp, neon_to_gp_q, f_mrc, f_mrrc")
+	    (const_string "neon_to_gp")
+
+	  (eq_attr "type" "neon_load1_1reg, neon_load1_1reg_q")
+	    (const_string "neon_load1_1")
+
+	  (eq_attr "type" "neon_load1_2reg, neon_load1_2reg_q")
+	    (const_string "neon_load1_2")
+
+	  (eq_attr "type" "neon_load1_3reg, neon_load1_3reg_q")
+	    (const_string "neon_load1_3")
+
+	  (eq_attr "type" "neon_load1_4reg, neon_load1_4reg_q")
+	    (const_string "neon_load1_4")
+
+	  (eq_attr "type" "neon_load1_one_lane, neon_load1_one_lane_q")
+	    (const_string "neon_load1_one")
+
+	  (eq_attr "type" "neon_load1_all_lanes, neon_load1_all_lanes_q")
+	    (const_string "neon_load1_all")
+
+	  (eq_attr "type" "neon_load2_2reg, neon_load2_2reg_q,\
+			   neon_load2_4reg, neon_load2_4reg_q")
+	    (const_string "neon_load2_2")
+
+	  (eq_attr "type" "neon_load2_one_lane, neon_load2_one_lane_q")
+	    (const_string "neon_load2_one")
+
+	  (eq_attr "type" "neon_load2_all_lanes, neon_load2_all_lanes_q")
+	    (const_string "neon_load2_all")
+
+	  (eq_attr "type" "neon_load3_3reg, neon_load3_3reg_q")
+	    (const_string "neon_load3_3")
+
+	  (eq_attr "type" "neon_load3_one_lane, neon_load3_one_lane_q")
+	    (const_string "neon_load3_one")
+
+	  (eq_attr "type" "neon_load3_all_lanes, neon_load3_all_lanes_q")
+	    (const_string "neon_load3_all")
+
+	  (eq_attr "type" "neon_load4_4reg, neon_load4_4reg_q")
+	    (const_string "neon_load4_4")
+
+	  (eq_attr "type" "neon_load4_one_lane, neon_load4_one_lane_q")
+	    (const_string "neon_load4_one")
+
+	  (eq_attr "type" "neon_load4_all_lanes, neon_load4_all_lanes_q")
+	    (const_string "neon_load4_all")
+
+	  (eq_attr "type" "f_stores, f_stored,\
+			   neon_stp, neon_stp_q")
+	    (const_string "neon_store")
+
+	  (eq_attr "type" "neon_store1_1reg, neon_store1_1reg_q")
+	    (const_string "neon_store1_1")
+
+	  (eq_attr "type" "neon_store1_2reg, neon_store1_2reg_q")
+	    (const_string "neon_store1_2")
+
+	  (eq_attr "type" "neon_store1_3reg, neon_store1_3reg_q")
+	    (const_string "neon_store1_3")
+
+	  (eq_attr "type" "neon_store1_4reg, neon_store1_4reg_q")
+	    (const_string "neon_store1_4")
+
+	  (eq_attr "type" "neon_store1_one_lane, neon_store1_one_lane_q")
+	    (const_string "neon_store1_one")
+
+	  (eq_attr "type" "neon_store2_2reg, neon_store2_2reg_q,\
+			   neon_store2_4reg, neon_store2_4reg_q")
+	    (const_string "neon_store2_2")
+
+	  (eq_attr "type" "neon_store2_one_lane, neon_store2_one_lane_q")
+	    (const_string "neon_store2_one")
+
+	  (eq_attr "type" "neon_store3_3reg, neon_store3_3reg_q")
+	    (const_string "neon_store3_3")
+
+	  (eq_attr "type" "neon_store3_one_lane, neon_store3_one_lane_q")
+	    (const_string "neon_store3_one")
+
+	  (eq_attr "type" "neon_store4_4reg, neon_store4_4reg_q")
+	    (const_string "neon_store4_4")
+
+	  (eq_attr "type" "neon_store4_one_lane, neon_store4_one_lane_q")
+	    (const_string "neon_store4_one")]
+
+	  (const_string "unknown")))
+
+;; Redefine this attribute for when building the AArch64 backend.
+(define_attr "predicated" "yes,no" (const_string "no"))
+
+;; The Exynos M1 core is modeled as a triple issue pipeline that has
+;; the following functional units.
+
+(define_automaton "exynos_m1_gp")
+(define_automaton "exynos_m1_ls")
+(define_automaton "exynos_m1_fp")
+
+;; 1.  Two pipelines for simple integer operations: A, B
+;; 2.  One pipeline for simple or complex integer operations: C
+
+(define_cpu_unit "em1_xa, em1_xb, em1_xc" "exynos_m1_gp")
+
+(define_reservation "em1_alu" "(em1_xa | em1_xb | em1_xc)")
+(define_reservation "em1_c" "em1_xc")
+
+;; 3.  Two asymmetric pipelines for Neon and FP operations: F0, F1
+
+(define_cpu_unit "em1_f0, em1_f1" "exynos_m1_fp")
+
+(define_reservation "em1_fmac" "em1_f0")
+(define_reservation "em1_fcvt" "em1_f0")
+(define_reservation "em1_nalu" "(em1_f0 | em1_f1)")
+(define_reservation "em1_nalu0" "em1_f0")
+(define_reservation "em1_nalu1" "em1_f1")
+(define_reservation "em1_nmisc" "em1_f0")
+(define_reservation "em1_ncrypt" "em1_f0")
+(define_reservation "em1_fadd" "em1_f1")
+(define_reservation "em1_fvar" "em1_f1")
+(define_reservation "em1_fst" "em1_f1")
+
+;; 4.  One pipeline for branch operations: BX
+
+(define_cpu_unit "em1_bx" "exynos_m1_gp")
+
+(define_reservation "em1_br" "em1_bx")
+
+;; 5.  One AGU for loads: L
+;;     One AGU for stores and one pipeline for stores: S, SD
+
+(define_cpu_unit "em1_lx" "exynos_m1_ls")
+(define_cpu_unit "em1_sx, em1_sd" "exynos_m1_ls")
+
+(define_reservation "em1_ld" "em1_lx")
+(define_reservation "em1_st" "(em1_sx + em1_sd)")
+
+;; Common occurrences
+(define_reservation "em1_sfst" "(em1_fst + em1_st)")
+(define_reservation "em1_lfst" "(em1_fst + em1_ld)")
+
+;; Branches
+;;
+;; No latency as there is no result
+;; TODO: Unconditional branches use no units;
+;; conditional branches add the BX unit;
+;; indirect branches add the C unit.
+(define_insn_reservation "exynos_m1_branch" 0
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "branch"))
+  "em1_br")
+
+(define_insn_reservation "exynos_m1_call" 1
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "call"))
+  "em1_alu")
+
+;; Basic ALU
+;;
+;; Simple ALU without shift, non-predicated
+(define_insn_reservation "exynos_m1_alu" 1
+  (and (eq_attr "tune" "exynosm1")
+       (and (not (eq_attr "predicated" "yes"))
+	    (eq_attr "type" "alu_imm, alus_imm, logic_imm, logics_imm,\
+			     alu_sreg, alus_sreg, logic_reg, logics_reg,\
+			     adc_imm, adcs_imm, adc_reg, adcs_reg,\
+			     adr, bfm, clz, rbit, rev, csel, alu_dsp_reg,\
+			     shift_imm, shift_reg, rotate_imm, extend,\
+			     mov_imm, mov_reg,\
+			     mvn_imm, mvn_reg,\
+			     mrs, multiple")))
+  "em1_alu")
+
+;; Simple ALU without shift, predicated
+(define_insn_reservation "exynos_m1_alu_p" 1
+  (and (eq_attr "tune" "exynosm1")
+       (and (eq_attr "predicated" "yes")
+	    (eq_attr "type" "alu_imm, alus_imm, logic_imm, logics_imm,\
+			     alu_sreg, alus_sreg, logic_reg, logics_reg,\
+			     adc_imm, adcs_imm, adc_reg, adcs_reg,\
+			     adr, bfm, clz, rbit, rev, alu_dsp_reg,\
+			     shift_imm, shift_reg, rotate_imm, extend,\
+			     mov_imm, mov_reg,\
+			     mvn_imm, mvn_reg,\
+			     mrs, multiple")))
+  "em1_c")
+
+;; ALU ops with immediate shift
+;; TODO: if the shift value is between 0 and 3, the latency is just 1 cycle;
+;;       otherwise it takes 2 cycles and the unit is blocked;
+;;       for now, assume the latter's latency and the former's units.
+(define_insn_reservation "exynos_m1_alu_shift" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "alu_ext, alus_ext,\
+			alu_shift_imm, alus_shift_imm,\
+			logic_shift_imm, logics_shift_imm,\
+			mov_shift, mvn_shift"))
+  "(em1_alu)")
+
+;; ALU ops with register controlled shift, non-predicated
+(define_insn_reservation "exynos_m1_alu_shift_reg" 2
+  (and (eq_attr "tune" "exynosm1")
+       (and (not (eq_attr "predicated" "yes"))
+	    (eq_attr "type" "alu_shift_reg, alus_shift_reg,\
+			     logic_shift_reg, logics_shift_reg,\
+			     mov_shift_reg, mvn_shift_reg")))
+   "(em1_alu * 2)")
+
+;; ALU ops with register controlled shift, predicated
+(define_insn_reservation "exynos_m1_alu_shift_reg_p" 2
+  (and (eq_attr "tune" "exynosm1")
+       (and (eq_attr "predicated" "yes")
+	    (eq_attr "type" "alu_shift_reg, alus_shift_reg,\
+			     logic_shift_reg, logics_shift_reg,\
+			     mov_shift_reg, mvn_shift_reg")))
+  "(em1_alu, em1_c)")
+
+;; Integer multiply
+(define_insn_reservation "exynos_m1_mul" 3
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "mul"))
+  "em1_c")
+
+;; Integer multiply-accumulate
+;; TODO: tell 32 from 64-bit ones
+(define_insn_reservation "exynos_m1_mla" 3
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_type" "mla"))
+  "em1_c")
+
+(define_insn_reservation "exynos_m1_mlal" 5
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_type" "mlal"))
+  "em1_alu, em1_c")
+
+;; Integer divide
+;; TODO: assume the median latency; blocks other divisions
+(define_insn_reservation "exynos_m1_div" 13
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "udiv, sdiv"))
+  "em1_c")
+
+;; Load-store execution Unit
+;;
+;; Loads of up to 2 words.
+(define_insn_reservation "exynos_m1_load" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "load_byte, load1, load2"))
+  "em1_ld")
+
+;; Loads of 3 or 4 words.
+(define_insn_reservation "exynos_m1_loadm" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "load3, load4"))
+  "(em1_ld * 3)")
+
+;; Stores of up to 2 words.
+(define_insn_reservation "exynos_m1_store" 1
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "store1, store2"))
+  "em1_st")
+
+;; Stores of 3 or 4 words.
+(define_insn_reservation "exynos_m1_storem" 3
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "store3, store4"))
+  "(em1_st * 3)")
+
+;; Advanced SIMD Unit
+;;
+;; Integer Arithmetic Instructions.
+
+(define_insn_reservation  "exynos_m1_arith_simple" 1
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_arith_simple"))
+  "em1_nmisc")
+
+(define_insn_reservation  "exynos_m1_neon_arith_basic" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_arith_basic"))
+  "em1_nalu")
+
+(define_insn_reservation  "exynos_m1_neon_arith_complex" 3
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_arith_complex"))
+  "em1_nmisc")
+
+;; Integer Multiply Instructions.
+
+(define_insn_reservation "exynos_m1_neon_multiply" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type"
+		"neon_multiply, neon_mla, neon_sat_mla_long"))
+  "em1_nmisc")
+
+;; Integer Shift Instructions.
+
+(define_insn_reservation
+  "exynos_m1_neon_shift_acc" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_shift_acc"))
+  "em1_nalu1")
+
+(define_insn_reservation
+  "exynos_m1_neon_shift_basic" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type"
+		"neon_shift_imm_basic, neon_shift_reg_basic"))
+  "em1_nalu")
+
+(define_insn_reservation
+  "exynos_m1_neon_shift_complex" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type"
+		"neon_shift_imm_complex, neon_shift_reg_complex"))
+  "em1_nalu1")
+
+;; Floating Point Instructions.
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_unary" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_unary"))
+  "em1_nalu")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_add" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_add"))
+  "em1_fadd")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_abd" 3
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_abd"))
+  "em1_nmisc")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_compare" 1
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_compare"))
+  "em1_nmisc")
+
+;; TODO: the latency and throughput of reduce insns actually varies between
+;; 3-5 and 1/4-1, but picked the median values.
+(define_insn_reservation
+  "exynos_m1_neon_fp_reduc" 5
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_reduc_minmax"))
+  "(em1_nmisc * 4)")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_reduc_add" 10
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_reduc_add"))
+  "((em1_nalu * 2), em1_fadd)")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_round" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_round"))
+  "em1_fcvt")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_cvt" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_cvt"))
+  "em1_fcvt")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_mul" 5
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_mul"))
+  "em1_fmac")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_mla" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_mla"))
+  "em1_fmac")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_estimate" 5
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_estimate"))
+  "em1_fcvt")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_estimatex" 1
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_estimatex"))
+  "em1_nmisc")
+
+(define_insn_reservation
+  "exynos_m1_neon_fp_step" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_fp_step"))
+  "em1_fmac")
+
+;; Miscellaneous Instructions.
+
+(define_insn_reservation
+  "exynos_m1_neon_bitops" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_bitops"))
+  "em1_nalu")
+
+(define_insn_reservation
+  "exynos_m1_neon_bitops_q" 3
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_bitops_q"))
+  "(em1_nalu, em1_nalu)")
+
+(define_insn_reservation
+  "exynos_m1_neon_bitins" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_bitins"))
+  "em1_nalu1")
+
+;; TODO: it is more complicated than this.
+(define_insn_reservation
+  "exynos_m1_neon_tbl" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_tbl"))
+  "em1_nalu1")
+
+(define_insn_reservation
+  "exynos_m1_neon_from_gp" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_from_gp"))
+  "em1_st")
+
+(define_insn_reservation
+  "exynos_m1_neon_to_gp" 9
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_to_gp"))
+  "em1_lfst")
+
+;; Load Instructions.
+
+(define_insn_reservation
+  "exynos_m1_neon_load" 5
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "f_loads, f_loadd, neon_ldp"))
+  "em1_ld")
+
+(define_insn_reservation
+  "exynos_m1_neon_load_q" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "neon_ldp_q"))
+  "(em1_ld, em1_ld)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load1_1" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load1_1, neon_load1_all"))
+  "em1_ld")
+
+(define_insn_reservation
+  "exynos_m1_neon_load1_2" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load1_2"))
+  "(em1_ld * 2)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load1_3" 7
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load1_3"))
+  "(em1_ld * 3)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load1_4" 8
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load1_4"))
+  "(em1_ld * 4)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load1_one" 7
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load1_one"))
+  "((em1_ld * 2), em1_nalu)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load2_2" 10
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load2_2"))
+  "(em1_ld * 5)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load2_one" 7
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load2_one"))
+  "((em1_ld * 2), (em1_nalu * 2))")
+
+(define_insn_reservation
+  "exynos_m1_neon_load2_all" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load2_all"))
+  "(em1_ld * 2)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load3_3" 12
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load3_3"))
+  "(em1_ld * 6)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load3_one" 9
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load3_one"))
+  "((em1_ld * 4), (em1_nalu * 3))")
+
+(define_insn_reservation
+  "exynos_m1_neon_load3_all" 7
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load3_all"))
+  "(em1_ld * 3)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load4_4" 14
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load4_4"))
+  "(em1_ld * 7)")
+
+(define_insn_reservation
+  "exynos_m1_neon_load4_one" 9
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load4_one"))
+  "((em1_ld * 4), (em1_nalu * 4))")
+
+(define_insn_reservation
+  "exynos_m1_neon_load4_all" 8
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_load4_all"))
+  "(em1_ld * 4)")
+
+;; Store Instructions.
+
+(define_insn_reservation
+  "exynos_m1_neon_store" 1
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store"))
+  "(em1_fst, em1_st)")
+
+(define_insn_reservation
+  "exynos_m1_neon_store1_1" 1
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store1_1"))
+  "em1_sfst")
+
+(define_insn_reservation
+  "exynos_m1_neon_store1_2" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store1_2"))
+  "(em1_sfst * 2)")
+
+(define_insn_reservation
+  "exynos_m1_neon_store1_3" 3
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store1_3"))
+  "(em1_sfst * 3)")
+
+(define_insn_reservation
+  "exynos_m1_neon_store1_4" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store1_4"))
+  "(em1_sfst * 4)")
+
+(define_insn_reservation
+  "exynos_m1_neon_store1_one" 7
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store1_one"))
+  "(em1_fst, em1_st)")
+
+(define_insn_reservation
+  "exynos_m1_neon_store2" 7
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store2_2, neon_store2_one"))
+  "em1_sfst, em1_fst")
+
+(define_insn_reservation
+  "exynos_m1_neon_store3" 16
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store3_3, neon_store3_one"))
+  "((em1_sfst * 3), (em1_fst * 2), em1_nalu)")
+
+(define_insn_reservation
+  "exynos_m1_neon_store4" 17
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "exynos_m1_neon_type" "neon_store4_4, neon_store4_one"))
+  "((em1_sfst * 4), (em1_fst * 2), em1_nalu)")
+
+;; Floating-Point Operations.
+
+(define_insn_reservation "exynos_m1_fp_const" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fconsts, fconstd"))
+  "em1_nalu")
+
+(define_insn_reservation "exynos_m1_fp_add" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fadds, faddd"))
+  "em1_fadd")
+
+(define_insn_reservation "exynos_m1_fp_mul" 5
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fmuls, fmuld"))
+  "em1_fmac")
+
+(define_insn_reservation "exynos_m1_fp_mac" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fmacs, ffmas, fmacd, ffmad"))
+  "em1_fmac")
+
+(define_insn_reservation "exynos_m1_fp_cvt" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "f_cvt, f_rints, f_rintd"))
+  "em1_fcvt")
+
+(define_insn_reservation "exynos_m1_fp_cvt_i" 13
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "f_cvtf2i"))
+  "(em1_fcvt, em1_lfst)")
+
+(define_insn_reservation "exynos_m1_i_cvt_fp" 9
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "f_cvti2f"))
+  "(em1_st, em1_fcvt)")
+
+(define_insn_reservation "exynos_m1_fp_cmp" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fcmps, fcmpd"))
+  "em1_nmisc")
+
+(define_insn_reservation "exynos_m1_fp_sel" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fcsel"))
+  "(em1_st + em1_nalu0)")
+
+(define_insn_reservation "exynos_m1_fp_arith" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "ffariths, ffarithd"))
+  "em1_nalu")
+
+(define_insn_reservation "exynos_m1_fp_cpy" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fmov"))
+  "em1_nalu")
+
+(define_insn_reservation "exynos_m1_fp_divs" 15
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fdivs, neon_fp_div_s, neon_fp_div_s_q,\
+			fsqrts, neon_fp_sqrt_s, neon_fp_sqrt_s_q"))
+  "(em1_fvar * 9)")
+
+(define_insn_reservation "exynos_m1_fp_divd" 22
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "fdivd, neon_fp_div_d, neon_fp_div_d_q,\
+			fsqrtd, neon_fp_sqrt_d, neon_fp_sqrt_d_q"))
+  "(em1_fvar * 9)")
+
+(define_insn_reservation "exynos_m1_fp_minmax" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "f_minmaxs, f_minmaxd"))
+  "(em1_nmisc * 2)")
+
+;; Crypto Operations.
+
+(define_insn_reservation "exynos_m1_crypto_simple" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "crypto_aese, crypto_aesmc,\
+			crypto_sha1_xor, crypto_sha1_fast, crypto_sha256_fast"))
+  "em1_ncrypt")
+
+(define_insn_reservation "exynos_m1_crypto_complex" 6
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "crypto_sha1_slow, crypto_sha256_slow"))
+  "em1_ncrypt")
+
+(define_insn_reservation "exynos_m1_crypto_poly" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "neon_mul_b_long, neon_mul_h_long, neon_mul_s_long"))
+  "em1_ncrypt")
+
+(define_insn_reservation "exynos_m1_crypto_polyl" 4
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "neon_mul_d_long"))
+  "em1_ncrypt")
+
+(define_insn_reservation "exynos_m1_crc" 2
+  (and (eq_attr "tune" "exynosm1")
+       (eq_attr "type" "crc"))
+  "em1_c")
+
+;; Simple execution unit bypasses
+
+;; Pre-decrement and post-increment addressing modes update the register quickly.
+;; TODO: figure out how to tell the addressing mode register from the loaded one.
+(define_bypass 1 "exynos_m1_store*" "exynos_m1_store*")
+
+;; MLAs can feed other MLAs quickly.
+(define_bypass 1 "exynos_m1_mla*" "exynos_m1_mla")
+(define_bypass 4 "exynos_m1_mla*" "exynos_m1_mlal")
+
+;; Insns in FMAC or FADD can feed other such insns quickly.
+(define_bypass 4 "exynos_m1_fp_mul"
+		 "exynos_m1_fp_add, exynos_m1_fp_mul, exynos_m1_fp_mac")
+(define_bypass 5 "exynos_m1_fp_mac"
+		 "exynos_m1_fp_add, exynos_m1_fp_mul, exynos_m1_fp_mac")
+(define_bypass 4 "exynos_m1_neon_fp_mul"
+		 "exynos_m1_neon_fp_add, exynos_m1_neon_fp_mul,\
+		  exynos_m1_neon_fp_mla, exynos_m1_neon_fp_step")
+(define_bypass 5 "exynos_m1_neon_fp_mla, exynos_m1_neon_fp_step"
+		 "exynos_m1_neon_fp_add, exynos_m1_neon_fp_mul,\
+		  exynos_m1_neon_fp_mla, exynos_m1_neon_fp_step")
+
+(define_bypass 3 "exynos_m1_fp_add"
+		 "exynos_m1_fp_add, exynos_m1_fp_mul, exynos_m1_fp_mac")
+(define_bypass 3 "exynos_m1_neon_fp_add"
+		 "exynos_m1_neon_fp_add, exynos_m1_neon_fp_mul,\
+		  exynos_m1_neon_fp_mla, exynos_m1_neon_fp_step")
+
+;; Insns in NALU can feed other such insns quickly.
+(define_bypass 1 "exynos_m1_fp_const, exynos_m1_fp_arith, exynos_m1_fp_cpy"
+		 "exynos_m1_fp_const, exynos_m1_fp_arith, exynos_m1_fp_cpy,\
+		  exynos_m1_fp_sel")
+(define_bypass 3 "exynos_m1_fp_sel"
+		 "exynos_m1_fp_const, exynos_m1_fp_arith, exynos_m1_fp_cpy,\
+		  exynos_m1_fp_sel")
+(define_bypass 1 "exynos_m1_neon_arith_basic, exynos_m1_neon_shift_basic,\
+		  exynos_m1_neon_bitops, exynos_m1_neon_bitins,\
+		  exynos_m1_neon_tbl"
+		 "exynos_m1_neon_arith_basic, exynos_m1_neon_shift_basic,\
+		  exynos_m1_neon_shift_acc, exynos_m1_neon_shift_complex,\
+		  exynos_m1_neon_bitops*, exynos_m1_neon_bitins,\
+		  exynos_m1_neon_tbl")
+(define_bypass 3 "exynos_m1_neon_shift_acc, exynos_m1_neon_shift_complex"
+		 "exynos_m1_neon_arith_basic, exynos_m1_neon_shift_basic,\
+		  exynos_m1_neon_shift_acc, exynos_m1_neon_shift_complex,\
+		  exynos_m1_neon_bitops*, exynos_m1_neon_bitins,\
+		  exynos_m1_neon_tbl")
+(define_bypass 1 "exynos_m1_neon_fp_unary" "exynos_m1_neon_fp_unary")
+
+;; Insns in NCRYPT can feed other such insns quickly.
+(define_bypass 1 "exynos_m1_crypto_simple, exynos_m1_crypto_poly"
+		 "exynos_m1_crypto_simple, exynos_m1_crypto_complex,\
+		  exynos_m1_crypto_poly*")
+(define_bypass 3 "exynos_m1_crypto_polyl"
+		 "exynos_m1_crypto_simple, exynos_m1_crypto_complex,\
+		  exynos_m1_crypto_poly*")
+(define_bypass 5 "exynos_m1_crypto_complex"
+		 "exynos_m1_crypto_simple, exynos_m1_crypto_complex,\
+		  exynos_m1_crypto_poly*")
+
+;; Predicted branches take no time, but mispredicted ones take forever anyway.
+(define_bypass 1 "exynos_m1_*"
+		 "exynos_m1_call, exynos_m1_branch")