diff mbox

[ARM,3/5] New bypass between mac operations in cortex-a7 pipeline description

Message ID 001201cdfb29$16052800$420f7800$@yorsh@arm.com
State New
Headers show

Commit Message

Greta Yorsh Jan. 25, 2013, 6:23 p.m. UTC
Add bypasses to forward the result of one MAC operation to the accumulator
of another MAC operation.

Towards this end, we add a new function arm_mac_accumulator_is_result to be
used as a guard for bypasses. Existing guard
arm_mac_accumulator_is_mul_result requires a multiply operation as the
producer and a multiply-accumulate operation as the consumer. The new guard
allows more general producers and consumers. It allows the consumer to be a
multiply-accumulate or multiply-subtract operation. It allows the producer
to be any SET operation, although only MAC operations are used as producers
in the pipeline description of Cortex-A7.

gcc/

2013-01-03  Greta Yorsh  <Greta.Yorsh@arm.com>

        * config/arm/arm-protos.h (arm_mac_accumulator_is_result): New
        declaration.
        * config/arm/arm.c (arm_mac_accumulator_is_result): New function.
        * config/arm/cortex-a7.md: New bypasses using
        arm_mac_accumulator_is_result.

Comments

Ramana Radhakrishnan Jan. 29, 2013, 5:47 p.m. UTC | #1
On 01/25/13 18:23, Greta Yorsh wrote:
> Add bypasses to forward the result of one MAC operation to the accumulator
> of another MAC operation.
>
> Towards this end, we add a new function arm_mac_accumulator_is_result to be
> used as a guard for bypasses. Existing guard
> arm_mac_accumulator_is_mul_result requires a multiply operation as the
> producer and a multiply-accumulate operation as the consumer. The new guard
> allows more general producers and consumers. It allows the consumer to be a
> multiply-accumulate or multiply-subtract operation. It allows the producer
> to be any SET operation, although only MAC operations are used as producers
> in the pipeline description of Cortex-A7.
>
> gcc/
>
> 2013-01-03  Greta Yorsh  <Greta.Yorsh@arm.com>
>
>          * config/arm/arm-protos.h (arm_mac_accumulator_is_result): New
>          declaration.
>          * config/arm/arm.c (arm_mac_accumulator_is_result): New function.
>          * config/arm/cortex-a7.md: New bypasses using
>          arm_mac_accumulator_is_result.
>

Ok as it only affects Cortex-A7.

Ramana
diff mbox

Patch

diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index 4c61e35..885ccff 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -102,6 +102,7 @@  extern int arm_early_load_addr_dep (rtx, rtx);
 extern int arm_no_early_alu_shift_dep (rtx, rtx);
 extern int arm_no_early_alu_shift_value_dep (rtx, rtx);
 extern int arm_no_early_mul_dep (rtx, rtx);
+extern int arm_mac_accumulator_is_result (rtx, rtx);
 extern int arm_mac_accumulator_is_mul_result (rtx, rtx);
 
 extern int tls_mentioned_p (rtx);
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 13d745f..39f1eb3 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -24610,6 +24610,62 @@  arm_cxx_guard_type (void)
   return TARGET_AAPCS_BASED ? integer_type_node : long_long_integer_type_node;
 }
 
+/* Return non-zero iff the consumer (a multiply-accumulate or a
+   multiple-subtract instruction) has an accumulator dependency on the
+   result of the producer and no other dependency on that result.  It
+   does not check if the producer is multiply-accumulate instruction.  */
+int
+arm_mac_accumulator_is_result (rtx producer, rtx consumer)
+{
+  rtx result;
+  rtx op0, op1, acc;
+
+  producer = PATTERN (producer);
+  consumer = PATTERN (consumer);
+
+  if (GET_CODE (producer) == COND_EXEC)
+    producer = COND_EXEC_CODE (producer);
+  if (GET_CODE (consumer) == COND_EXEC)
+    consumer = COND_EXEC_CODE (consumer);
+
+  if (GET_CODE (producer) != SET)
+    return 0;
+
+  result = XEXP (producer, 0);
+
+  if (GET_CODE (consumer) != SET)
+    return 0;
+
+  /* Check that the consumer is of the form
+     (set (...) (plus (mult ...) (...)))
+     or
+     (set (...) (minus (...) (mult ...))).  */
+  if (GET_CODE (XEXP (consumer, 1)) == PLUS)
+    {
+      if (GET_CODE (XEXP (XEXP (consumer, 1), 0)) != MULT)
+        return 0;
+
+      op0 = XEXP (XEXP (XEXP (consumer, 1), 0), 0);
+      op1 = XEXP (XEXP (XEXP (consumer, 1), 0), 1);
+      acc = XEXP (XEXP (consumer, 1), 1);
+    }
+  else if (GET_CODE (XEXP (consumer, 1)) == MINUS)
+    {
+      if (GET_CODE (XEXP (XEXP (consumer, 1), 1)) != MULT)
+        return 0;
+
+      op0 = XEXP (XEXP (XEXP (consumer, 1), 1), 0);
+      op1 = XEXP (XEXP (XEXP (consumer, 1), 1), 1);
+      acc = XEXP (XEXP (consumer, 1), 0);
+    }
+  else
+    return 0;
+
+  return (reg_overlap_mentioned_p (result, acc)
+          && !reg_overlap_mentioned_p (result, op0)
+          && !reg_overlap_mentioned_p (result, op1));
+}
+
 /* Return non-zero if the consumer (a multiply-accumulate instruction)
    has an accumulator dependency on the result of the producer (a
    multiplication instruction) and no other dependency on that result.  */
diff --git a/gcc/config/arm/cortex-a7.md b/gcc/config/arm/cortex-a7.md
index 930242d..2cef5fd 100644
--- a/gcc/config/arm/cortex-a7.md
+++ b/gcc/config/arm/cortex-a7.md
@@ -137,6 +137,12 @@ 
             (eq_attr "neon_type" "none")))
   "cortex_a7_both")
 
+;; Forward the result of a multiply operation to the accumulator 
+;; of the following multiply and accumulate instruction.
+(define_bypass 1 "cortex_a7_mul"
+                 "cortex_a7_mul"
+                 "arm_mac_accumulator_is_result")
+
 ;; The latency depends on the operands, so we use an estimate here.
 (define_insn_reservation "cortex_a7_idiv" 5
   (and (eq_attr "tune" "cortexa7")
@@ -264,6 +271,10 @@ 
                  neon_fp_vmla_qqq_scalar"))
   "cortex_a7_both+cortex_a7_fpmul_pipe")
 
+(define_bypass 4 "cortex_a7_fpmacs,cortex_a7_neon_mla"
+                 "cortex_a7_fpmacs,cortex_a7_neon_mla"
+                 "arm_mac_accumulator_is_result")
+
 ;; Non-multiply instructions can issue between two cycles of a
 ;; double-precision multiply. 
 
@@ -285,6 +296,10 @@ 
             (eq_attr "neon_type" "none")))
   "cortex_a7_ex1+cortex_a7_fpmul_pipe, cortex_a7_fpmul_pipe*4")
 
+(define_bypass 7 "cortex_a7_fpmacd"
+                 "cortex_a7_fpmacd,cortex_a7_fpfmad"
+                 "arm_mac_accumulator_is_result")
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; Floating-point divide/square root instructions.
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;