@@ -102,6 +102,7 @@ extern int arm_early_load_addr_dep (rtx, rtx);
extern int arm_no_early_alu_shift_dep (rtx, rtx);
extern int arm_no_early_alu_shift_value_dep (rtx, rtx);
extern int arm_no_early_mul_dep (rtx, rtx);
+extern int arm_mac_accumulator_is_result (rtx, rtx);
extern int arm_mac_accumulator_is_mul_result (rtx, rtx);
extern int tls_mentioned_p (rtx);
@@ -24610,6 +24610,62 @@ arm_cxx_guard_type (void)
return TARGET_AAPCS_BASED ? integer_type_node : long_long_integer_type_node;
}
+/* Return non-zero iff the consumer (a multiply-accumulate or a
+ multiple-subtract instruction) has an accumulator dependency on the
+ result of the producer and no other dependency on that result. It
+ does not check if the producer is multiply-accumulate instruction. */
+int
+arm_mac_accumulator_is_result (rtx producer, rtx consumer)
+{
+ rtx result;
+ rtx op0, op1, acc;
+
+ producer = PATTERN (producer);
+ consumer = PATTERN (consumer);
+
+ if (GET_CODE (producer) == COND_EXEC)
+ producer = COND_EXEC_CODE (producer);
+ if (GET_CODE (consumer) == COND_EXEC)
+ consumer = COND_EXEC_CODE (consumer);
+
+ if (GET_CODE (producer) != SET)
+ return 0;
+
+ result = XEXP (producer, 0);
+
+ if (GET_CODE (consumer) != SET)
+ return 0;
+
+ /* Check that the consumer is of the form
+ (set (...) (plus (mult ...) (...)))
+ or
+ (set (...) (minus (...) (mult ...))). */
+ if (GET_CODE (XEXP (consumer, 1)) == PLUS)
+ {
+ if (GET_CODE (XEXP (XEXP (consumer, 1), 0)) != MULT)
+ return 0;
+
+ op0 = XEXP (XEXP (XEXP (consumer, 1), 0), 0);
+ op1 = XEXP (XEXP (XEXP (consumer, 1), 0), 1);
+ acc = XEXP (XEXP (consumer, 1), 1);
+ }
+ else if (GET_CODE (XEXP (consumer, 1)) == MINUS)
+ {
+ if (GET_CODE (XEXP (XEXP (consumer, 1), 1)) != MULT)
+ return 0;
+
+ op0 = XEXP (XEXP (XEXP (consumer, 1), 1), 0);
+ op1 = XEXP (XEXP (XEXP (consumer, 1), 1), 1);
+ acc = XEXP (XEXP (consumer, 1), 0);
+ }
+ else
+ return 0;
+
+ return (reg_overlap_mentioned_p (result, acc)
+ && !reg_overlap_mentioned_p (result, op0)
+ && !reg_overlap_mentioned_p (result, op1));
+}
+
/* Return non-zero if the consumer (a multiply-accumulate instruction)
has an accumulator dependency on the result of the producer (a
multiplication instruction) and no other dependency on that result. */
@@ -137,6 +137,12 @@
(eq_attr "neon_type" "none")))
"cortex_a7_both")
+;; Forward the result of a multiply operation to the accumulator
+;; of the following multiply and accumulate instruction.
+(define_bypass 1 "cortex_a7_mul"
+ "cortex_a7_mul"
+ "arm_mac_accumulator_is_result")
+
;; The latency depends on the operands, so we use an estimate here.
(define_insn_reservation "cortex_a7_idiv" 5
(and (eq_attr "tune" "cortexa7")
@@ -264,6 +271,10 @@
neon_fp_vmla_qqq_scalar"))
"cortex_a7_both+cortex_a7_fpmul_pipe")
+(define_bypass 4 "cortex_a7_fpmacs,cortex_a7_neon_mla"
+ "cortex_a7_fpmacs,cortex_a7_neon_mla"
+ "arm_mac_accumulator_is_result")
+
;; Non-multiply instructions can issue between two cycles of a
;; double-precision multiply.
@@ -285,6 +296,10 @@
(eq_attr "neon_type" "none")))
"cortex_a7_ex1+cortex_a7_fpmul_pipe, cortex_a7_fpmul_pipe*4")
+(define_bypass 7 "cortex_a7_fpmacd"
+ "cortex_a7_fpmacd,cortex_a7_fpfmad"
+ "arm_mac_accumulator_is_result")
+
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Floating-point divide/square root instructions.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;