Patchwork [RFA/ARM,02/05] : LDRD generation instead of POP in A15 Thumb2 epilogue.

login
register
mail settings
Submitter Sameera Deshpande
Date Oct. 11, 2011, 9:21 a.m.
Message ID <1318324917.2186.52.camel@e102549-lin.cambridge.arm.com>
Download mbox | patch
Permalink /patch/118887/
State New
Headers show

Comments

Sameera Deshpande - Oct. 11, 2011, 9:21 a.m.
Hi!

This patch generates LDRD instead of POP for Thumb2 epilogue in A15. 

When optimizing for size (optimize_size), the original POP-based epilogue is
still generated for A15.
The work involves defining new functions, predicates and patterns.

LDRD cannot load PC.  If PC is in the register list, LDRD is generated for
the other registers in the list that can be paired.  If PC is then the only
register left to be popped, an LDR with return is generated; otherwise a
POP with return is generated.

The patch is tested with check-gcc, check-gdb and bootstrap with no
regression. 

Changelog entry for Patch to emit LDRD for thumb2 epilogue in A15:

2011-10-11  Sameera Deshpande  <sameera.deshpande@arm.com>

        * config/arm/arm-protos.h (bad_reg_pair_for_thumb_ldrd_strd): New
          declaration.
        * config/arm/arm.c (bad_reg_pair_for_thumb_ldrd_strd): New helper
          function.
          (thumb2_emit_ldrd_pop): New static function.
          (thumb2_expand_epilogue): Call thumb2_emit_ldrd_pop when the tuning
          prefers LDRD and we are not optimizing for size.
        * config/arm/constraints.md (Pz): New constraint.
        * config/arm/ldmstm.md (*thumb2_ldrd_base): New pattern.
          (*thumb2_ldrd): Likewise.
        * config/arm/predicates.md (ldrd_immediate_operand): New predicate.

--
Richard Henderson - Oct. 13, 2011, 5:32 p.m.
On 10/11/2011 02:21 AM, Sameera Deshpande wrote:
> +            /* When saved-register index (i) is odd, RTXs for both the registers
> +               to be loaded are generated in above given LDRD pattern, and the
> +               pattern can be emitted now.  */
> +            par = emit_insn (par);
> +            add_reg_note (par, REG_FRAME_RELATED_EXPR, dwarf);

I don't believe REG_FRAME_RELATED_EXPR does the right thing for 
anything besides prologues.  You need to emit REG_CFA_RESTORE
for the pop inside an epilogue.


r~

Patch

diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index c6b8f71..06a67b5 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -202,6 +202,7 @@  extern void thumb_reload_in_hi (rtx *);
 extern void thumb_set_return_address (rtx, rtx);
 extern const char *thumb1_output_casesi (rtx *);
 extern const char *thumb2_output_casesi (rtx *);
+extern bool bad_reg_pair_for_thumb_ldrd_strd (rtx, rtx);
 #endif
 
 /* Defined in pe.c.  */
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index d709375..3eba510 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -15410,6 +15410,155 @@  arm_emit_vfp_multi_reg_pop (int first_reg, int num_regs, rtx base_reg)
   par = emit_insn (par);
   add_reg_note (par, REG_FRAME_RELATED_EXPR, dwarf);
 }
+/* Return true if SRC1 and SRC2 cannot be the register pair of a Thumb
+   LDRD/STRD: both must be REGs, distinct, and neither may be SP or PC.  */
+bool
+bad_reg_pair_for_thumb_ldrd_strd (rtx src1, rtx src2)
+{
+  if (GET_CODE (src1) != REG || GET_CODE (src2) != REG)
+    return true;
+  return (REGNO (src1) == REGNO (src2)
+          || REGNO (src1) == PC_REGNUM || REGNO (src1) == SP_REGNUM
+          || REGNO (src2) == PC_REGNUM || REGNO (src2) == SP_REGNUM);
+}
+
+/* Emit Thumb-2 epilogue code that restores the registers in SAVED_REGS_MASK
+   using LDRD instead of POP.  Registers are loaded in pairs from consecutive
+   stack words; the two registers of a pair need not be consecutive.  LDRD
+   cannot load PC, so when REALLY_RETURN is true and PC is in the mask it is
+   restored last: by an LDR with return, or by a POP with return when one
+   other register is also left over.  When not returning, a left-over
+   register is restored by a plain LDR.  Return REALLY_RETURN unchanged.  */
+static bool
+thumb2_emit_ldrd_pop (unsigned long saved_regs_mask, bool really_return)
+{
+  int num_regs = 0;
+  int i, j;
+  rtx par = NULL_RTX;
+  rtx dwarf = NULL_RTX;
+  rtx tmp, reg, tmp1;
+
+  for (i = 0; i <= LAST_ARM_REGNUM; i++)
+    if (saved_regs_mask & (1 << i))
+      num_regs++;
+
+  gcc_assert (num_regs && num_regs <= 16);
+  gcc_assert (really_return || ((saved_regs_mask & (1 << PC_REGNUM)) == 0));
+
+  if (really_return && (saved_regs_mask & (1 << PC_REGNUM)))
+    /* We cannot generate ldrd for PC.  Hence, reduce the count if PC is
+       to be popped.  So, if num_regs is even, now it will become odd,
+       and we can generate pop with PC.  If num_regs is odd, it will be
+       even now, and ldr with return can be generated for PC.  */
+    num_regs--;
+
+  for (i = 0, j = 0; i < (num_regs - (num_regs % 2)); j++)
+    /* Var j iterates over all the registers to gather all the registers in
+       saved_regs_mask.  Var i gives index of saved registers in stack frame.
+       A PARALLEL RTX of register-pair is created here, so that pattern for
+       LDRD can be matched.  As PC is always last register to be popped, and
+       we have already decremented num_regs if PC, we don't have to worry
+       about PC in this loop.  */
+    if (saved_regs_mask & (1 << j))
+      {
+        gcc_assert (j != SP_REGNUM);
+
+        /* Create RTX for memory load.  New RTX is created for dwarf as
+           they are not sharable.  */
+        reg = gen_rtx_REG (SImode, j);
+        tmp = gen_rtx_SET (VOIDmode,
+                           reg,
+                           gen_frame_mem (SImode,
+                               plus_constant (stack_pointer_rtx, 4 * i)));
+
+        tmp1 = gen_rtx_SET (VOIDmode,
+                           reg,
+                           gen_frame_mem (SImode,
+                               plus_constant (stack_pointer_rtx, 4 * i)));
+        RTX_FRAME_RELATED_P (tmp) = 1;
+        RTX_FRAME_RELATED_P (tmp1) = 1;
+
+        if (i % 2 == 0)
+          {
+            /* When saved-register index (i) is even, the RTX to be emitted is
+               yet to be created.  Hence create it first.  The LDRD pattern we
+               are generating is :
+               [ (SET (reg_t0) (MEM (PLUS (SP) (NUM))))
+                 (SET (reg_t1) (MEM (PLUS (SP) (NUM + 4)))) ]
+               where target registers need not be consecutive.  */
+            par = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (2));
+
+            /* We need to maintain a sequence for DWARF info too.  */
+            dwarf = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
+          }
+
+        /* ith register is added in PARALLEL RTX.  If i is even, the reg_i is
+           added as 0th element and if i is odd, reg_i is added as 1st element
+           of LDRD pattern shown above.  */
+        XVECEXP (par, 0, (i % 2)) = tmp;
+        XVECEXP (dwarf, 0, (i % 2)) = tmp1;
+
+        if ((i % 2) == 1)
+          {
+            /* Both loads of the pair are now in place, so emit the LDRD.
+               TODO: epilogue restores should probably be described with
+               REG_CFA_RESTORE notes instead of REG_FRAME_RELATED_EXPR.  */
+            par = emit_insn (par);
+            add_reg_note (par, REG_FRAME_RELATED_EXPR, dwarf);
+          }
+
+        i++;
+      }
+
+  /* Step SP past the I words loaded by the LDRDs above; when no pair was
+     loaded (I is zero) there is nothing to adjust.  A register that is
+     still left over is restored below with a post-increment load.  */
+  if (i > 0)
+    {
+      tmp = gen_rtx_SET (VOIDmode,
+                         stack_pointer_rtx,
+                         plus_constant (stack_pointer_rtx, 4 * i));
+      RTX_FRAME_RELATED_P (tmp) = 1;
+      emit_insn (tmp);
+    }
+
+  if (((num_regs % 2) == 1 && !really_return)
+      || ((num_regs % 2) == 0 && really_return))
+    {
+      /* Find the one register still to be restored; LDR it with post-inc.  */
+      for (; (saved_regs_mask & (1 << j)) == 0; j++);
+
+      tmp1 = gen_rtx_MEM (SImode,
+                          gen_rtx_POST_INC (SImode,
+                                            stack_pointer_rtx));
+      set_mem_alias_set (tmp1, get_frame_alias_set ());
+
+      reg = gen_rtx_REG (SImode, j);
+      tmp = gen_rtx_SET (VOIDmode, reg, tmp1);
+      RTX_FRAME_RELATED_P (tmp) = 1;
+
+      if (really_return)
+        {
+          /* If really_return, j must be PC_REGNUM.  */
+          gcc_assert (j == PC_REGNUM);
+          par = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (2));
+          XVECEXP (par, 0, 0) = ret_rtx;
+          XVECEXP (par, 0, 1) = tmp;
+          emit_jump_insn (par);
+        }
+      else
+        emit_insn (tmp);
+    }
+  else if ((num_regs % 2) == 1 && really_return)
+    {
+      /* Two registers remain: the last unpaired one and PC.  Generate the
+         pattern pop_multiple_with_stack_update_and_return to pop in PC.  */
+      arm_emit_multi_reg_pop (saved_regs_mask & (~((1 << j) - 1)),
+                              really_return);
+    }
+
+  return really_return;
+}
 
 /* Calculate the size of the return value that is passed in registers.  */
 static unsigned
@@ -22236,7 +22385,13 @@  thumb2_expand_epilogue (bool is_sibling)
               really_return = true;
             }
 
-          arm_emit_multi_reg_pop (saved_regs_mask, really_return);
+          if (!current_tune->prefer_ldrd_strd || optimize_size)
+            arm_emit_multi_reg_pop (saved_regs_mask, really_return);
+          else
+            /* Generate LDRD pattern instead of POP pattern.  */
+            really_return = thumb2_emit_ldrd_pop (saved_regs_mask,
+                                                  really_return);
+
           if (really_return == true)
             return;
         }
diff --git a/gcc/config/arm/constraints.md b/gcc/config/arm/constraints.md
index d8ce982..3c55699 100644
--- a/gcc/config/arm/constraints.md
+++ b/gcc/config/arm/constraints.md
@@ -207,6 +207,12 @@ 
   (and (match_code "const_int")
        (match_test "TARGET_THUMB2 && ival >= 0 && ival <= 255")))
 
+(define_constraint "Pz"
+  "@internal In Thumb-2 state a multiple of 4 in the range -1020 to 1020"
+  (and (match_code "const_int")
+       (match_test "TARGET_THUMB2 && ival >= -1020 && ival <= 1020
+                    && ival % 4 == 0")))
+
 (define_constraint "G"
  "In ARM/Thumb-2 state a valid FPA immediate constant."
  (and (match_code "const_double")
diff --git a/gcc/config/arm/ldmstm.md b/gcc/config/arm/ldmstm.md
index 5db4a32..21d2815 100644
--- a/gcc/config/arm/ldmstm.md
+++ b/gcc/config/arm/ldmstm.md
@@ -21,6 +21,32 @@ 
    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
    <http://www.gnu.org/licenses/>.  */
 
+;; Load two words from [Rn] and [Rn, #4] into two registers with one LDRD.
+;; The destination pair is vetted by bad_reg_pair_for_thumb_ldrd_strd.
+(define_insn "*thumb2_ldrd_base"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+        (mem:SI (match_operand:SI 1 "s_register_operand" "rk")))
+   (set (match_operand:SI 2 "register_operand" "=r")
+        (mem:SI (plus:SI (match_dup 1) (const_int 4))))]
+  "(TARGET_THUMB2 && current_tune->prefer_ldrd_strd
+     && (!bad_reg_pair_for_thumb_ldrd_strd (operands[0], operands[2])))"
+  "ldrd%?\t%0, %2, [%1]"
+  [(set_attr "type" "load2") (set_attr "predicable" "yes")])
+
+;; LDRD from [Rn, #imm] and [Rn, #imm + 4]; operand 4 must equal operand 2 + 4.
+(define_insn "*thumb2_ldrd"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+        (mem:SI (plus:SI (match_operand:SI 1 "s_register_operand" "rk")
+                         (match_operand:SI 2 "ldrd_immediate_operand" "Pz"))))
+   (set (match_operand:SI 3 "register_operand" "=r")
+        (mem:SI (plus:SI (match_dup 1)
+                         (match_operand:SI 4 "const_int_operand" ""))))]
+  "(TARGET_THUMB2 && current_tune->prefer_ldrd_strd
+     && ((INTVAL (operands[2]) + 4) == INTVAL (operands[4]))
+     && (!bad_reg_pair_for_thumb_ldrd_strd (operands[0], operands[3])))"
+  "ldrd%?\t%0, %3, [%1, %2]"
+  [(set_attr "type" "load2") (set_attr "predicable" "yes")])
+
 (define_insn "*ldm4_ia"
   [(match_parallel 0 "load_multiple_operation"
     [(set (match_operand:SI 1 "arm_hard_register_operand" "")
diff --git a/gcc/config/arm/predicates.md b/gcc/config/arm/predicates.md
index 79e65fe..e074425 100644
--- a/gcc/config/arm/predicates.md
+++ b/gcc/config/arm/predicates.md
@@ -203,6 +203,10 @@ 
 	    (match_test "(GET_CODE (op) != CONST_INT
 			  || (INTVAL (op) < 4096 && INTVAL (op) > -4096))"))))
 
+;; True for a const_int that satisfies constraint Pz (a valid LDRD offset).
+(define_predicate "ldrd_immediate_operand"
+  (and (match_code "const_int") (match_test "satisfies_constraint_Pz (op)")))
+
 ;; True for operators that can be combined with a shift in ARM state.
 (define_special_predicate "shiftable_operator"
   (and (match_code "plus,minus,ior,xor,and")