===================================================================
@@ -74,6 +74,7 @@ type availability = Source of int
| Dest_n_after of int * int
type guard = Guard_none | Guard_only_m | Guard_only_n | Guard_only_d
+ | Guard_writeback | Guard_writeback_only
(* Reservation behaviors. All but the last row here correspond to one
pipeline each. Each constructor will correspond to one
@@ -240,6 +241,9 @@ let availability_table = [
(* MRC instructions are in the .tpl file. *)
]
+(* The latency to use on all address register writeback dependencies. *)
+let writeback_latency = 1
+
(* Augment the tuples in the availability table with an extra component
that describes the earliest stage where a source operand may be
required. (It is also possible that an entry in the table has no
@@ -355,9 +359,15 @@ let pick_latency largest worst guards =
of one bypass from this producer to any particular consumer listed
in LATENCIES.) Use a hash table to collate bypasses with the
same latency and guard. *)
-let collate_bypasses (producer_name, _, _, _) largest latencies core =
+let collate_bypasses (producer_name, _, resource, _) largest latencies core =
let ht = Hashtbl.create 42 in
let keys = ref [] in
+ let add_latency consumer (guard, latency) =
+ if (try ignore (Hashtbl.find ht (guard, latency)); false
+ with Not_found -> true)
+ then keys := (guard, latency) :: !keys;
+ Hashtbl.add ht (guard, latency) ((coreStr core) ^ "_" ^ consumer)
+ in
List.iter (
fun ((consumer, _, _, _), worst, guards) ->
(* Find out which latency to use. Ignoring latencies that match
@@ -369,14 +379,42 @@ let collate_bypasses (producer_name, _,
let guard_latency_opt = pick_latency largest worst guards in
match guard_latency_opt with
None -> ()
- | Some (guard, latency) ->
- begin
- (if (try ignore (Hashtbl.find ht (guard, latency)); false
- with Not_found -> true) then
- keys := (guard, latency) :: !keys);
- Hashtbl.add ht (guard, latency) ((coreStr core) ^ "_" ^ consumer)
- end
+ | Some pair -> add_latency consumer pair
) latencies;
+ (* Add in the writeback dependencies for loads and stores. *)
+ begin
+ match resource with
+ Ls _ ->
+ if largest > writeback_latency then
+ (* Having a writeback-only dependency decreases the latency. *)
+ begin
+ (* We don't handle cases where the largest latency is
+ greater than writeback_latency and where the smallest
+ is less. *)
+ List.iter
+ (fun (guard, latency) -> assert (latency >= writeback_latency))
+ !keys;
+ add_latency "*" (Guard_writeback_only, writeback_latency)
+ end
+ else if largest < writeback_latency or !keys <> [] then
+ (* Having a writeback dependency either increases the latency
+ or reinforces the default latency. A bypass is only required
+ in the latter case if there are other bypasses too. *)
+ add_latency "*" (Guard_writeback, writeback_latency)
+
+ | _ ->
+ ()
+ end;
+ (* A comparison function that sorts keys in order of decreasing latency.
+ The guard order isn't interesting but is needed to stabilise the
+ sort. *)
+ let comp_fn (guard1, latency1) (guard2, latency2) =
+ if latency1 > latency2 then -1
+ else if latency1 < latency2 then 1
+ else if guard1 > guard2 then -1
+ else if guard2 > guard1 then 1
+ else 0
+ in
(* The hash table now has bypasses collated so that ones with the
same latency and guard have the same keys. Walk through all the
keys, extract the associated bypasses, and concatenate the names
@@ -388,7 +426,7 @@ let collate_bypasses (producer_name, _,
String.concat ",\\\n " consumers,
latency,
guard)
- ) !keys
+ ) (List.sort comp_fn !keys)
(* For every producer, find the worst-case latency between it and
*any* consumer. Also determine (if such a thing exists) the
@@ -505,6 +543,8 @@ let guard_fn g =
Guard_only_m -> "arm_neon_only_m_dependency"
| Guard_only_n -> "arm_neon_only_n_dependency"
| Guard_only_d -> "arm_neon_only_d_dependency"
+ | Guard_writeback -> "arm_writeback_dep"
+ | Guard_writeback_only -> "arm_writeback_only_dep"
| Guard_none -> assert false
(* Emit a define_bypass for each bypass. *)
===================================================================
@@ -638,6 +638,18 @@ (define_bypass 2 "cortex_a8_neon_vld3_vl
cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\
cortex_a8_neon_fp_vrecps_vrsqrts_qqq")
+(define_bypass 1 "cortex_a8_neon_vld3_vld4_all_lanes"
+ "cortex_a8_*"
+ "arm_writeback_only_dep")
+
+(define_bypass 1 "cortex_a8_neon_vst3_vst4_lane"
+ "cortex_a8_*"
+ "arm_writeback_dep")
+
+(define_bypass 1 "cortex_a8_neon_vst1_vst2_lane"
+ "cortex_a8_*"
+ "arm_writeback_dep")
+
(define_bypass 5 "cortex_a8_neon_vld3_vld4_lane"
"cortex_a8_neon_int_1,\
cortex_a8_neon_int_4,\
@@ -652,6 +664,10 @@ (define_bypass 5 "cortex_a8_neon_vld3_vl
cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\
cortex_a8_neon_fp_vrecps_vrsqrts_qqq")
+(define_bypass 1 "cortex_a8_neon_vld3_vld4_lane"
+ "cortex_a8_*"
+ "arm_writeback_only_dep")
+
(define_bypass 3 "cortex_a8_neon_vld1_vld2_lane"
"cortex_a8_neon_int_1,\
cortex_a8_neon_int_4,\
@@ -666,6 +682,26 @@ (define_bypass 3 "cortex_a8_neon_vld1_vl
cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\
cortex_a8_neon_fp_vrecps_vrsqrts_qqq")
+(define_bypass 1 "cortex_a8_neon_vld1_vld2_lane"
+ "cortex_a8_*"
+ "arm_writeback_only_dep")
+
+(define_bypass 1 "cortex_a8_neon_vst3_vst4"
+ "cortex_a8_*"
+ "arm_writeback_dep")
+
+(define_bypass 1 "cortex_a8_neon_vst2_4_regs_vst3_vst4"
+ "cortex_a8_*"
+ "arm_writeback_dep")
+
+(define_bypass 1 "cortex_a8_neon_vst1_3_4_regs"
+ "cortex_a8_*"
+ "arm_writeback_dep")
+
+(define_bypass 1 "cortex_a8_neon_vst1_1_2_regs_vst2_2_regs"
+ "cortex_a8_*"
+ "arm_writeback_dep")
+
(define_bypass 4 "cortex_a8_neon_vld3_vld4"
"cortex_a8_neon_int_1,\
cortex_a8_neon_int_4,\
@@ -680,6 +716,10 @@ (define_bypass 4 "cortex_a8_neon_vld3_vl
cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\
cortex_a8_neon_fp_vrecps_vrsqrts_qqq")
+(define_bypass 1 "cortex_a8_neon_vld3_vld4"
+ "cortex_a8_*"
+ "arm_writeback_only_dep")
+
(define_bypass 3 "cortex_a8_neon_vld2_4_regs"
"cortex_a8_neon_int_1,\
cortex_a8_neon_int_4,\
@@ -694,6 +734,10 @@ (define_bypass 3 "cortex_a8_neon_vld2_4_
cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\
cortex_a8_neon_fp_vrecps_vrsqrts_qqq")
+(define_bypass 1 "cortex_a8_neon_vld2_4_regs"
+ "cortex_a8_*"
+ "arm_writeback_only_dep")
+
(define_bypass 2 "cortex_a8_neon_vld2_2_regs_vld1_vld2_all_lanes"
"cortex_a8_neon_int_1,\
cortex_a8_neon_int_4,\
@@ -708,6 +752,10 @@ (define_bypass 2 "cortex_a8_neon_vld2_2_
cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\
cortex_a8_neon_fp_vrecps_vrsqrts_qqq")
+(define_bypass 1 "cortex_a8_neon_vld2_2_regs_vld1_vld2_all_lanes"
+ "cortex_a8_*"
+ "arm_writeback_only_dep")
+
(define_bypass 2 "cortex_a8_neon_vld1_3_4_regs"
"cortex_a8_neon_int_1,\
cortex_a8_neon_int_4,\
@@ -722,6 +770,14 @@ (define_bypass 2 "cortex_a8_neon_vld1_3_
cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\
cortex_a8_neon_fp_vrecps_vrsqrts_qqq")
+(define_bypass 1 "cortex_a8_neon_vld1_3_4_regs"
+ "cortex_a8_*"
+ "arm_writeback_only_dep")
+
+(define_bypass 1 "cortex_a8_neon_vld1_1_2_regs"
+ "cortex_a8_*"
+ "arm_writeback_only_dep")
+
(define_bypass 1 "cortex_a8_neon_vld1_1_2_regs"
"cortex_a8_neon_int_1,\
cortex_a8_neon_int_4,\
@@ -736,6 +792,14 @@ (define_bypass 1 "cortex_a8_neon_vld1_1_
cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\
cortex_a8_neon_fp_vrecps_vrsqrts_qqq")
+(define_bypass 1 "cortex_a8_neon_str"
+ "cortex_a8_*"
+ "arm_writeback_dep")
+
+(define_bypass 1 "cortex_a8_neon_ldr"
+ "cortex_a8_*"
+ "arm_writeback_dep")
+
(define_bypass 0 "cortex_a8_neon_ldr"
"cortex_a8_neon_int_1,\
cortex_a8_neon_int_4,\
===================================================================
@@ -563,6 +563,18 @@ (define_bypass 2 "cortex_a9_neon_vld3_vl
cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\
cortex_a9_neon_fp_vrecps_vrsqrts_qqq")
+(define_bypass 1 "cortex_a9_neon_vld3_vld4_all_lanes"
+ "cortex_a9_*"
+ "arm_writeback_only_dep")
+
+(define_bypass 1 "cortex_a9_neon_vst3_vst4_lane"
+ "cortex_a9_*"
+ "arm_writeback_dep")
+
+(define_bypass 1 "cortex_a9_neon_vst1_vst2_lane"
+ "cortex_a9_*"
+ "arm_writeback_dep")
+
(define_bypass 5 "cortex_a9_neon_vld3_vld4_lane"
"cortex_a9_neon_int_1,\
cortex_a9_neon_int_4,\
@@ -577,6 +589,10 @@ (define_bypass 5 "cortex_a9_neon_vld3_vl
cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\
cortex_a9_neon_fp_vrecps_vrsqrts_qqq")
+(define_bypass 1 "cortex_a9_neon_vld3_vld4_lane"
+ "cortex_a9_*"
+ "arm_writeback_only_dep")
+
(define_bypass 3 "cortex_a9_neon_vld1_vld2_lane"
"cortex_a9_neon_int_1,\
cortex_a9_neon_int_4,\
@@ -591,6 +607,26 @@ (define_bypass 3 "cortex_a9_neon_vld1_vl
cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\
cortex_a9_neon_fp_vrecps_vrsqrts_qqq")
+(define_bypass 1 "cortex_a9_neon_vld1_vld2_lane"
+ "cortex_a9_*"
+ "arm_writeback_only_dep")
+
+(define_bypass 1 "cortex_a9_neon_vst3_vst4"
+ "cortex_a9_*"
+ "arm_writeback_dep")
+
+(define_bypass 1 "cortex_a9_neon_vst2_4_regs_vst3_vst4"
+ "cortex_a9_*"
+ "arm_writeback_dep")
+
+(define_bypass 1 "cortex_a9_neon_vst1_3_4_regs"
+ "cortex_a9_*"
+ "arm_writeback_dep")
+
+(define_bypass 1 "cortex_a9_neon_vst1_1_2_regs_vst2_2_regs"
+ "cortex_a9_*"
+ "arm_writeback_dep")
+
(define_bypass 4 "cortex_a9_neon_vld3_vld4"
"cortex_a9_neon_int_1,\
cortex_a9_neon_int_4,\
@@ -605,6 +641,10 @@ (define_bypass 4 "cortex_a9_neon_vld3_vl
cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\
cortex_a9_neon_fp_vrecps_vrsqrts_qqq")
+(define_bypass 1 "cortex_a9_neon_vld3_vld4"
+ "cortex_a9_*"
+ "arm_writeback_only_dep")
+
(define_bypass 3 "cortex_a9_neon_vld2_4_regs"
"cortex_a9_neon_int_1,\
cortex_a9_neon_int_4,\
@@ -619,6 +659,10 @@ (define_bypass 3 "cortex_a9_neon_vld2_4_
cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\
cortex_a9_neon_fp_vrecps_vrsqrts_qqq")
+(define_bypass 1 "cortex_a9_neon_vld2_4_regs"
+ "cortex_a9_*"
+ "arm_writeback_only_dep")
+
(define_bypass 2 "cortex_a9_neon_vld2_2_regs_vld1_vld2_all_lanes"
"cortex_a9_neon_int_1,\
cortex_a9_neon_int_4,\
@@ -633,6 +677,10 @@ (define_bypass 2 "cortex_a9_neon_vld2_2_
cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\
cortex_a9_neon_fp_vrecps_vrsqrts_qqq")
+(define_bypass 1 "cortex_a9_neon_vld2_2_regs_vld1_vld2_all_lanes"
+ "cortex_a9_*"
+ "arm_writeback_only_dep")
+
(define_bypass 2 "cortex_a9_neon_vld1_3_4_regs"
"cortex_a9_neon_int_1,\
cortex_a9_neon_int_4,\
@@ -647,6 +695,14 @@ (define_bypass 2 "cortex_a9_neon_vld1_3_
cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\
cortex_a9_neon_fp_vrecps_vrsqrts_qqq")
+(define_bypass 1 "cortex_a9_neon_vld1_3_4_regs"
+ "cortex_a9_*"
+ "arm_writeback_only_dep")
+
+(define_bypass 1 "cortex_a9_neon_vld1_1_2_regs"
+ "cortex_a9_*"
+ "arm_writeback_only_dep")
+
(define_bypass 1 "cortex_a9_neon_vld1_1_2_regs"
"cortex_a9_neon_int_1,\
cortex_a9_neon_int_4,\
@@ -661,6 +717,14 @@ (define_bypass 1 "cortex_a9_neon_vld1_1_
cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\
cortex_a9_neon_fp_vrecps_vrsqrts_qqq")
+(define_bypass 1 "cortex_a9_neon_str"
+ "cortex_a9_*"
+ "arm_writeback_dep")
+
+(define_bypass 1 "cortex_a9_neon_ldr"
+ "cortex_a9_*"
+ "arm_writeback_dep")
+
(define_bypass 0 "cortex_a9_neon_ldr"
"cortex_a9_neon_int_1,\
cortex_a9_neon_int_4,\