diff mbox

Add zero-overhead looping for xtensa backend

Message ID CAFc0fxw_SrT8c5y=Ly+X98UhN5_P1bAjMg3n0K06fxAv0ioebg@mail.gmail.com
State New
Headers show

Commit Message

Felix Yang Jan. 8, 2014, 4:27 p.m. UTC
Hi Sterling,

  This patch implements zero-overhead looping for xtensa backend using
hw-doloop facility.
  If OK for trunk, please apply it for me. Thanks.


+}
+
 #include "gt-xtensa.h"

Cheers,
Felix
Index: gcc/ChangeLog
===================================================================
--- gcc/ChangeLog	(revision 206431)
+++ gcc/ChangeLog	(working copy)
@@ -1,3 +1,18 @@
+2014-01-09  Felix Yang  <fei.yang0953@gmail.com>
+
+	* config/xtensa/xtensa.c (xtensa_reorg): New.
+	(xtensa_reorg_loops): New.
+	(xtensa_can_use_doloop_p): New.
+	(xtensa_invalid_within_doloop): New.
+	(hwloop_optimize): New.
+	(hwloop_fail): New.
+	(hwloop_pattern_reg): New.
+	(xtensa_emit_loop_end): Modified to emit the zero-overhead loop end label.
+	(xtensa_doloop_hooks): Define.
+	* config/xtensa/xtensa.md (doloop_end): New.
+	(zero_cost_loop_start): Rewritten.
+	(zero_cost_loop_end): Rewritten.
+
 2014-01-08  Marek Polacek  <polacek@redhat.com>
 
 	PR middle-end/59669
Index: gcc/config/xtensa/xtensa.md
===================================================================
--- gcc/config/xtensa/xtensa.md	(revision 206431)
+++ gcc/config/xtensa/xtensa.md	(working copy)
@@ -35,6 +35,8 @@
   (UNSPEC_TLS_CALL	9)
   (UNSPEC_TP		10)
   (UNSPEC_MEMW		11)
+  (UNSPEC_LSETUP_START  12)
+  (UNSPEC_LSETUP_END    13)
 
   (UNSPECV_SET_FP	1)
   (UNSPECV_ENTRY	2)
@@ -1289,6 +1291,8 @@
    (set_attr "length"	"3")])
 
 
+;; Hardware loop support.
+
 ;; Define the loop insns used by bct optimization to represent the
 ;; start and end of a zero-overhead loop (in loop.c).  This start
 ;; template generates the loop insn; the end template doesn't generate
@@ -1296,34 +1300,58 @@
 
 (define_insn "zero_cost_loop_start"
   [(set (pc)
-	(if_then_else (eq (match_operand:SI 0 "register_operand" "a")
-			  (const_int 0))
-		      (label_ref (match_operand 1 "" ""))
-		      (pc)))
-   (set (reg:SI 19)
-	(plus:SI (match_dup 0) (const_int -1)))]
+        (if_then_else (ne (match_operand:SI 2 "nonimmediate_operand" "0")
+                          (const_int 1))
+                      (label_ref (match_operand 1 "" ""))
+                      (pc)))
+   (set (match_operand:SI 0 "nonimmediate_operand" "=a")
+        (plus (match_dup 2)
+              (const_int -1)))
+   (unspec [(const_int 0)] UNSPEC_LSETUP_START)]
   ""
-  "loopnez\t%0, %l1"
+  "loop\t%0, %l1_LEND"
   [(set_attr "type"	"jump")
    (set_attr "mode"	"none")
    (set_attr "length"	"3")])
 
 (define_insn "zero_cost_loop_end"
   [(set (pc)
-	(if_then_else (ne (reg:SI 19) (const_int 0))
-		      (label_ref (match_operand 0 "" ""))
-		      (pc)))
-   (set (reg:SI 19)
-	(plus:SI (reg:SI 19) (const_int -1)))]
+        (if_then_else (ne (match_operand:SI 2 "nonimmediate_operand" "0")
+                          (const_int 1))
+                      (label_ref (match_operand 1 "" ""))
+                      (pc)))
+   (set (match_operand:SI 0 "nonimmediate_operand" "=a")
+        (plus (match_dup 2)
+              (const_int -1)))
+   (unspec [(const_int 0)] UNSPEC_LSETUP_END)]
   ""
 {
-    xtensa_emit_loop_end (insn, operands);
-    return "";
+  xtensa_emit_loop_end (insn, operands);
+  return "";
 }
   [(set_attr "type"	"jump")
    (set_attr "mode"	"none")
    (set_attr "length"	"0")])
 
+; operand 0 is the loop count pseudo register
+; operand 1 is the label to jump to at the top of the loop
+(define_expand "doloop_end"
+  [(parallel [(set (pc) (if_then_else
+                          (ne (match_operand:SI 0 "" "")
+                              (const_int 1))
+                          (label_ref (match_operand 1 "" ""))
+                          (pc)))
+              (set (match_dup 0)
+                   (plus:SI (match_dup 0)
+                            (const_int -1)))
+              (unspec [(const_int 0)] UNSPEC_LSETUP_END)])]
+  ""
+{
+  /* The loop optimizer doesn't check the predicates... */
+  if (GET_MODE (operands[0]) != SImode)
+    FAIL;
+})
+
 
 ;; Setting a register from a comparison.
 
Index: gcc/config/xtensa/xtensa.c
===================================================================
--- gcc/config/xtensa/xtensa.c	(revision 206431)
+++ gcc/config/xtensa/xtensa.c	(working copy)
@@ -1,6 +1,7 @@
 /* Subroutines for insn-output.c for Tensilica's Xtensa architecture.
    Copyright (C) 2001-2014 Free Software Foundation, Inc.
    Contributed by Bob Wilson (bwilson@tensilica.com) at Tensilica.
+   Zero-overhead looping support by Felix Yang (felix.yang0953@gmail.com).
 
 This file is part of GCC.
 
@@ -61,8 +62,9 @@ along with GCC; see the file COPYING3.  If not see
 #include "gimple.h"
 #include "gimplify.h"
 #include "df.h"
+#include "hw-doloop.h"
+#include "dumpfile.h"
 
-
 /* Enumeration for all of the relational tests, so that we can build
    arrays indexed by the test type, and not worry about the order
    of EQ, NE, etc.  */
@@ -186,6 +188,10 @@ static reg_class_t xtensa_secondary_reload (bool,
 
 static bool constantpool_address_p (const_rtx addr);
 static bool xtensa_legitimate_constant_p (enum machine_mode, rtx);
+static void xtensa_reorg (void);
+static bool xtensa_can_use_doloop_p (double_int, double_int iterations_max,
+                                     unsigned int, bool);
+static const char *xtensa_invalid_within_doloop (const_rtx);
 
 static bool xtensa_member_type_forces_blk (const_tree,
 					   enum machine_mode mode);
@@ -312,6 +318,15 @@ static const int reg_nonleaf_alloc_order[FIRST_PSE
 #undef TARGET_LEGITIMATE_CONSTANT_P
 #define TARGET_LEGITIMATE_CONSTANT_P xtensa_legitimate_constant_p
 
+#undef TARGET_MACHINE_DEPENDENT_REORG
+#define TARGET_MACHINE_DEPENDENT_REORG xtensa_reorg
+
+#undef TARGET_CAN_USE_DOLOOP_P
+#define TARGET_CAN_USE_DOLOOP_P xtensa_can_use_doloop_p
+
+#undef TARGET_INVALID_WITHIN_DOLOOP
+#define TARGET_INVALID_WITHIN_DOLOOP xtensa_invalid_within_doloop
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 
@@ -1676,7 +1691,7 @@ xtensa_emit_loop_end (rtx insn, rtx *operands)
         }
     }
 
-  output_asm_insn ("# loop end for %0", operands);
+  output_asm_insn ("%1_LEND:", operands);
 }
 
 
@@ -3709,4 +3724,224 @@ xtensa_legitimate_constant_p (enum machine_mode mo
   return !xtensa_tls_referenced_p (x);
 }
 
+/* Implement TARGET_CAN_USE_DOLOOP_P.  */
+
+static bool
+xtensa_can_use_doloop_p (double_int, double_int,
+                         unsigned int level, bool entered_at_top)
+{
+  /* Considering limitations in the hardware, only use doloop for innermost loops
+     which must be entered from the top.  */
+  if (level != 1 || !entered_at_top)
+    return false;
+
+  return true;
+}
+
+/* NULL if INSN insn is valid within a low-overhead loop.
+   Otherwise return why doloop cannot be applied.  */
+
+static const char *
+xtensa_invalid_within_doloop (const_rtx insn)
+{
+  if (CALL_P (insn))
+    return "Function call in the loop.";
+
+  return NULL;
+}
+
+/* Optimize LOOP.  */
+
+static bool
+hwloop_optimize (hwloop_info loop)
+{
+  int i;
+  edge entry_edge;
+  basic_block entry_bb;
+  rtx insn, seq, iter_reg, entry_after;
+
+  if (loop->depth > 1)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d is not innermost\n", loop->loop_no);
+      return false;
+    }
+
+  if (!loop->incoming_dest)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d has more than one entry\n", loop->loop_no);
+      return false;
+    }
+
+  if (loop->incoming_dest != loop->head)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d is not entered from head\n", loop->loop_no);
+      return false;
+    }
+
+  if (loop->has_call || loop->has_asm)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d has invalid insn\n", loop->loop_no);
+      return false;
+    }
+
+  /* Scan all the blocks to make sure they don't use iter_reg.  */
+  if (loop->iter_reg_used || loop->iter_reg_used_outside)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d uses iterator\n", loop->loop_no);
+      return false;
+    }
+
+  /* Check if start_label appears before doloop_end.  */
+  insn = loop->start_label;
+  while (insn && insn != loop->loop_end)
+    insn = NEXT_INSN (insn);
+
+  if (!insn)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d start_label not before loop_end\n",
+                 loop->loop_no);
+      return false;
+    }
+
+  /* Get the loop iteration register.  */
+  iter_reg = loop->iter_reg;
+
+  gcc_assert (REG_P (iter_reg));
+
+  entry_edge = NULL;
+
+  FOR_EACH_VEC_SAFE_ELT (loop->incoming, i, entry_edge)
+    if (entry_edge->flags & EDGE_FALLTHRU)
+      break;
+
+  if (entry_edge == NULL)
+    return false;
+
+  /* Place the zero_cost_loop_start instruction before the loop.  */
+  entry_bb = entry_edge->src;
+
+  start_sequence ();
+
+  insn = emit_insn (gen_zero_cost_loop_start (loop->iter_reg,
+                                              loop->start_label,
+                                              loop->iter_reg));
+
+  seq = get_insns ();
+
+  if (!single_succ_p (entry_bb) || vec_safe_length (loop->incoming) > 1)
+    {
+      basic_block new_bb;
+      edge e;
+      edge_iterator ei;
+
+      emit_insn_before (seq, BB_HEAD (loop->head));
+      seq = emit_label_before (gen_label_rtx (), seq);
+
+      new_bb = create_basic_block (seq, insn, entry_bb);
+      FOR_EACH_EDGE (e, ei, loop->incoming)
+        {
+          if (!(e->flags & EDGE_FALLTHRU))
+            redirect_edge_and_branch_force (e, new_bb);
+          else
+            redirect_edge_succ (e, new_bb);
+        }
+      make_edge (new_bb, loop->head, 0);
+    }
+  else
+    {
+      entry_after = BB_END (entry_bb);
+      while (DEBUG_INSN_P (entry_after)
+             || (NOTE_P (entry_after)
+                 && NOTE_KIND (entry_after) != NOTE_INSN_BASIC_BLOCK))
+        entry_after = PREV_INSN (entry_after);
+      emit_insn_after (seq, entry_after);
+    }
+
+  end_sequence (); 
+
+  return true;
+}
+
+/* A callback for the hw-doloop pass.  Called when a loop we have discovered
+   turns out not to be optimizable; we have to split the loop_end pattern into
+   a subtract and a test.  */
+
+static void
+hwloop_fail (hwloop_info loop)
+{
+  rtx test, insn = loop->loop_end;
+
+  emit_insn_before (gen_addsi3 (loop->iter_reg,
+                                loop->iter_reg,
+                                constm1_rtx),
+                    loop->loop_end);
+
+  test = gen_rtx_NE (VOIDmode, loop->iter_reg, const0_rtx);
+  insn = emit_jump_insn_before (gen_cbranchsi4 (test,
+                                                loop->iter_reg, const0_rtx,
+                                                loop->start_label),
+                                loop->loop_end);
+
+  JUMP_LABEL (insn) = loop->start_label;
+  LABEL_NUSES (loop->start_label)++;
+  delete_insn (loop->loop_end);
+}
+
+/* A callback for the hw-doloop pass.  This function examines INSN; if
+   it is a doloop_end pattern we recognize, return the reg rtx for the
+   loop counter.  Otherwise, return NULL_RTX.  */
+
+static rtx
+hwloop_pattern_reg (rtx insn)
+{
+  rtx reg;
+
+  if (!JUMP_P (insn) || recog_memoized (insn) != CODE_FOR_zero_cost_loop_end)
+    return NULL_RTX;
+
+  reg = SET_DEST (XVECEXP (PATTERN (insn), 0, 1));
+  if (!REG_P (reg))
+    return NULL_RTX;
+  return reg;
+}
+
+
+static struct hw_doloop_hooks xtensa_doloop_hooks =
+{
+  hwloop_pattern_reg,
+  hwloop_optimize,
+  hwloop_fail
+};
+
+/* Run from machine_dependent_reorg, this pass looks for doloop_end insns
+   and tries to rewrite the RTL of these loops so that proper Xtensa 
+   hardware loops are generated.  */
+
+static void
+xtensa_reorg_loops (void)
+{
+  reorg_loops (true, &xtensa_doloop_hooks);
+}
+
+/* Implement the TARGET_MACHINE_DEPENDENT_REORG pass.  */
+
+static void
+xtensa_reorg (void)
+{
+  /* We are freeing block_for_insn in the toplev to keep compatibility
+     with old MDEP_REORGS that are not CFG based.  Recompute it now.  */
+  compute_bb_for_insn ();
+
+  df_analyze ();
+
+  /* Doloop optimization.  */
+  xtensa_reorg_loops ();
+}
+
 #include "gt-xtensa.h"

Comments

augustine.sterling@gmail.com Jan. 8, 2014, 4:49 p.m. UTC | #1
On Wed, Jan 8, 2014 at 8:27 AM, Felix Yang <fei.yang0953@gmail.com> wrote:
> Hi Sterling,
>
>   This patch implements zero-overhead looping for xtensa backend using
> hw-doloop facility.
>   If OK for trunk, please apply it for me. Thanks.

Hi Felix,

I last worked on zero-overhead loops for Xtensa in the gcc 4.3
timeframe, but when I did, I ran into several problems related to
later optimizations rearranging the code which I didn't have time to
address.

I'm sure much of that experience is completely stale now, but I would
appreciate a detail of the testing you have done with this patch (in
particular, a description of the different xtensa configurations you
tested it against, especially the ones with and without loop
instructions) before I approve it. Please be sure the assembler can
relax the loops it generates as well. I don't see any particular
problem, but there are many, many gotchas when dealing with xtensa
loop instructions.

It also appears that Tensilica has stopped posting test results for
Xtensa, which makes it difficult to evaluate the quality of this
patch.

Thanks,

Sterling
diff mbox

Patch

Index: gcc/ChangeLog
===================================================================
--- gcc/ChangeLog    (revision 206431)
+++ gcc/ChangeLog    (working copy)
@@ -1,3 +1,18 @@ 
+2014-01-08  Felix Yang  <fei.yang0953@gmail.com>
+
+    * config/xtensa/xtensa.c (xtensa_reorg): New.
+    (xtensa_reorg_loops): New.
+    (xtensa_can_use_doloop_p): New.
+    (xtensa_invalid_within_doloop): New.
+    (hwloop_optimize): New.
+    (hwloop_fail): New.
+    (hwloop_pattern_reg): New.
+    (xtensa_emit_loop_end): Modified to emit the zero-overhead loop end label.
+    (xtensa_doloop_hooks): Define.
+    * config/xtensa/xtensa.md (doloop_end): New.
+    (zero_cost_loop_start): Rewritten.
+    (zero_cost_loop_end): Rewritten.
+
 2014-01-08  Marek Polacek  <polacek@redhat.com>

     PR middle-end/59669
Index: gcc/config/xtensa/xtensa.md
===================================================================
--- gcc/config/xtensa/xtensa.md    (revision 206431)
+++ gcc/config/xtensa/xtensa.md    (working copy)
@@ -35,6 +35,8 @@ 
   (UNSPEC_TLS_CALL    9)
   (UNSPEC_TP        10)
   (UNSPEC_MEMW        11)
+  (UNSPEC_LSETUP_START  12)
+  (UNSPEC_LSETUP_END    13)

   (UNSPECV_SET_FP    1)
   (UNSPECV_ENTRY    2)
@@ -1289,6 +1291,8 @@ 
    (set_attr "length"    "3")])


+;; Hardware loop support.
+
 ;; Define the loop insns used by bct optimization to represent the
 ;; start and end of a zero-overhead loop (in loop.c).  This start
 ;; template generates the loop insn; the end template doesn't generate
@@ -1296,34 +1300,58 @@ 

 (define_insn "zero_cost_loop_start"
   [(set (pc)
-    (if_then_else (eq (match_operand:SI 0 "register_operand" "a")
-              (const_int 0))
-              (label_ref (match_operand 1 "" ""))
-              (pc)))
-   (set (reg:SI 19)
-    (plus:SI (match_dup 0) (const_int -1)))]
+        (if_then_else (ne (match_operand:SI 2 "nonimmediate_operand" "0")
+                          (const_int 1))
+                      (label_ref (match_operand 1 "" ""))
+                      (pc)))
+   (set (match_operand:SI 0 "nonimmediate_operand" "=a")
+        (plus (match_dup 2)
+              (const_int -1)))
+   (unspec [(const_int 0)] UNSPEC_LSETUP_START)]
   ""
-  "loopnez\t%0, %l1"
+  "loop\t%0, %l1_LEND"
   [(set_attr "type"    "jump")
    (set_attr "mode"    "none")
    (set_attr "length"    "3")])

 (define_insn "zero_cost_loop_end"
   [(set (pc)
-    (if_then_else (ne (reg:SI 19) (const_int 0))
-              (label_ref (match_operand 0 "" ""))
-              (pc)))
-   (set (reg:SI 19)
-    (plus:SI (reg:SI 19) (const_int -1)))]
+        (if_then_else (ne (match_operand:SI 2 "nonimmediate_operand" "0")
+                          (const_int 1))
+                      (label_ref (match_operand 1 "" ""))
+                      (pc)))
+   (set (match_operand:SI 0 "nonimmediate_operand" "=a")
+        (plus (match_dup 2)
+              (const_int -1)))
+   (unspec [(const_int 0)] UNSPEC_LSETUP_END)]
   ""
 {
-    xtensa_emit_loop_end (insn, operands);
-    return "";
+  xtensa_emit_loop_end (insn, operands);
+  return "";
 }
   [(set_attr "type"    "jump")
    (set_attr "mode"    "none")
    (set_attr "length"    "0")])

+; operand 0 is the loop count pseudo register
+; operand 1 is the label to jump to at the top of the loop
+(define_expand "doloop_end"
+  [(parallel [(set (pc) (if_then_else
+                          (ne (match_operand:SI 0 "" "")
+                              (const_int 1))
+                          (label_ref (match_operand 1 "" ""))
+                          (pc)))
+              (set (match_dup 0)
+                   (plus:SI (match_dup 0)
+                            (const_int -1)))
+              (unspec [(const_int 0)] UNSPEC_LSETUP_END)])]
+  ""
+{
+  /* The loop optimizer doesn't check the predicates... */
+  if (GET_MODE (operands[0]) != SImode)
+    FAIL;
+})
+

 ;; Setting a register from a comparison.

Index: gcc/config/xtensa/xtensa.c
===================================================================
--- gcc/config/xtensa/xtensa.c    (revision 206431)
+++ gcc/config/xtensa/xtensa.c    (working copy)
@@ -1,6 +1,7 @@ 
 /* Subroutines for insn-output.c for Tensilica's Xtensa architecture.
    Copyright (C) 2001-2014 Free Software Foundation, Inc.
    Contributed by Bob Wilson (bwilson@tensilica.com) at Tensilica.
+   Zero-overhead looping support by Felix Yang (felix.yang0953@gmail.com).

 This file is part of GCC.

@@ -61,8 +62,9 @@  along with GCC; see the file COPYING3.  If not see
 #include "gimple.h"
 #include "gimplify.h"
 #include "df.h"
+#include "hw-doloop.h"
+#include "dumpfile.h"

-
 /* Enumeration for all of the relational tests, so that we can build
    arrays indexed by the test type, and not worry about the order
    of EQ, NE, etc.  */
@@ -186,6 +188,10 @@  static reg_class_t xtensa_secondary_reload (bool,

 static bool constantpool_address_p (const_rtx addr);
 static bool xtensa_legitimate_constant_p (enum machine_mode, rtx);
+static void xtensa_reorg (void);
+static bool xtensa_can_use_doloop_p (double_int, double_int iterations_max,
+                                     unsigned int, bool);
+static const char *xtensa_invalid_within_doloop (const_rtx);

 static bool xtensa_member_type_forces_blk (const_tree,
                        enum machine_mode mode);
@@ -312,6 +318,15 @@  static const int reg_nonleaf_alloc_order[FIRST_PSE
 #undef TARGET_LEGITIMATE_CONSTANT_P
 #define TARGET_LEGITIMATE_CONSTANT_P xtensa_legitimate_constant_p

+#undef TARGET_MACHINE_DEPENDENT_REORG
+#define TARGET_MACHINE_DEPENDENT_REORG xtensa_reorg
+
+#undef TARGET_CAN_USE_DOLOOP_P
+#define TARGET_CAN_USE_DOLOOP_P xtensa_can_use_doloop_p
+
+#undef TARGET_INVALID_WITHIN_DOLOOP
+#define TARGET_INVALID_WITHIN_DOLOOP xtensa_invalid_within_doloop
+
 struct gcc_target targetm = TARGET_INITIALIZER;


@@ -1676,7 +1691,7 @@  xtensa_emit_loop_end (rtx insn, rtx *operands)
         }
     }

-  output_asm_insn ("# loop end for %0", operands);
+  output_asm_insn ("%1_LEND:", operands);
 }


@@ -3709,4 +3724,224 @@  xtensa_legitimate_constant_p (enum machine_mode mo
   return !xtensa_tls_referenced_p (x);
 }

+/* Implement TARGET_CAN_USE_DOLOOP_P.  */
+
+static bool
+xtensa_can_use_doloop_p (double_int, double_int,
+                         unsigned int level, bool entered_at_top)
+{
+  /* Considering limitations in the hardware, only use doloop for
innermost loops
+     which must be entered from the top.  */
+  if (level != 1 || !entered_at_top)
+    return false;
+
+  return true;
+}
+
+/* NULL if INSN insn is valid within a low-overhead loop.
+   Otherwise return why doloop cannot be applied.  */
+
+static const char *
+xtensa_invalid_within_doloop (const_rtx insn)
+{
+  if (CALL_P (insn))
+    return "Function call in the loop.";
+
+  return NULL;
+}
+
+/* Optimize LOOP.  */
+
+static bool
+hwloop_optimize (hwloop_info loop)
+{
+  int i;
+  edge entry_edge;
+  basic_block entry_bb;
+  rtx insn, seq, iter_reg, entry_after;
+
+  if (loop->depth > 1)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d is not innermost\n", loop->loop_no);
+      return false;
+    }
+
+  if (!loop->incoming_dest)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d has more than one entry\n",
loop->loop_no);
+      return false;
+    }
+
+  if (loop->incoming_dest != loop->head)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d is not entered from head\n",
loop->loop_no);
+      return false;
+    }
+
+  if (loop->has_call || loop->has_asm)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d has invalid insn\n", loop->loop_no);
+      return false;
+    }
+
+  /* Scan all the blocks to make sure they don't use iter_reg.  */
+  if (loop->iter_reg_used || loop->iter_reg_used_outside)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d uses iterator\n", loop->loop_no);
+      return false;
+    }
+
+  /* Check if start_label appears before doloop_end.  */
+  insn = loop->start_label;
+  while (insn && insn != loop->loop_end)
+    insn = NEXT_INSN (insn);
+
+  if (!insn)
+    {
+      if (dump_file)
+        fprintf (dump_file, ";; loop %d start_label not before loop_end\n",
+                 loop->loop_no);
+      return false;
+    }
+
+  /* Get the loop iteration register.  */
+  iter_reg = loop->iter_reg;
+
+  gcc_assert (REG_P (iter_reg));
+
+  entry_edge = NULL;
+
+  FOR_EACH_VEC_SAFE_ELT (loop->incoming, i, entry_edge)
+    if (entry_edge->flags & EDGE_FALLTHRU)
+      break;
+
+  if (entry_edge == NULL)
+    return false;
+
+  /* Place the zero_cost_loop_start instruction before the loop.  */
+  entry_bb = entry_edge->src;
+
+  start_sequence ();
+
+  insn = emit_insn (gen_zero_cost_loop_start (loop->iter_reg,
+                                              loop->start_label,
+                                              loop->iter_reg));
+
+  seq = get_insns ();
+
+  if (!single_succ_p (entry_bb) || vec_safe_length (loop->incoming) > 1)
+    {
+      basic_block new_bb;
+      edge e;
+      edge_iterator ei;
+
+      emit_insn_before (seq, BB_HEAD (loop->head));
+      seq = emit_label_before (gen_label_rtx (), seq);
+
+      new_bb = create_basic_block (seq, insn, entry_bb);
+      FOR_EACH_EDGE (e, ei, loop->incoming)
+        {
+          if (!(e->flags & EDGE_FALLTHRU))
+            redirect_edge_and_branch_force (e, new_bb);
+          else
+            redirect_edge_succ (e, new_bb);
+        }
+      make_edge (new_bb, loop->head, 0);
+    }
+  else
+    {
+      entry_after = BB_END (entry_bb);
+      while (DEBUG_INSN_P (entry_after)
+             || (NOTE_P (entry_after)
+                 && NOTE_KIND (entry_after) != NOTE_INSN_BASIC_BLOCK))
+        entry_after = PREV_INSN (entry_after);
+      emit_insn_after (seq, entry_after);
+    }
+
+  end_sequence ();
+
+  return true;
+}
+
+/* A callback for the hw-doloop pass.  Called when a loop we have discovered
+   turns out not to be optimizable; we have to split the loop_end pattern into
+   a subtract and a test.  */
+
+static void
+hwloop_fail (hwloop_info loop)
+{
+  rtx test, insn = loop->loop_end;
+
+  emit_insn_before (gen_addsi3 (loop->iter_reg,
+                                loop->iter_reg,
+                                constm1_rtx),
+                    loop->loop_end);
+
+  test = gen_rtx_NE (VOIDmode, loop->iter_reg, const0_rtx);
+  insn = emit_jump_insn_before (gen_cbranchsi4 (test,
+                                                loop->iter_reg, const0_rtx,
+                                                loop->start_label),
+                                loop->loop_end);
+
+  JUMP_LABEL (insn) = loop->start_label;
+  LABEL_NUSES (loop->start_label)++;
+  delete_insn (loop->loop_end);
+}
+
+/* A callback for the hw-doloop pass.  This function examines INSN; if
+   it is a doloop_end pattern we recognize, return the reg rtx for the
+   loop counter.  Otherwise, return NULL_RTX.  */
+
+static rtx
+hwloop_pattern_reg (rtx insn)
+{
+  rtx reg;
+
+  if (!JUMP_P (insn) || recog_memoized (insn) != CODE_FOR_zero_cost_loop_end)
+    return NULL_RTX;
+
+  reg = SET_DEST (XVECEXP (PATTERN (insn), 0, 1));
+  if (!REG_P (reg))
+    return NULL_RTX;
+  return reg;
+}
+
+
+static struct hw_doloop_hooks xtensa_doloop_hooks =
+{
+  hwloop_pattern_reg,
+  hwloop_optimize,
+  hwloop_fail
+};
+
+/* Run from machine_dependent_reorg, this pass looks for doloop_end insns
+   and tries to rewrite the RTL of these loops so that proper Xtensa
+   hardware loops are generated.  */
+
+static void
+xtensa_reorg_loops (void)
+{
+  reorg_loops (true, &xtensa_doloop_hooks);
+}
+
+/* Implement the TARGET_MACHINE_DEPENDENT_REORG pass.  */
+
+static void
+xtensa_reorg (void)
+{
+  /* We are freeing block_for_insn in the toplev to keep compatibility
+     with old MDEP_REORGS that are not CFG based.  Recompute it now.  */
+  compute_bb_for_insn ();
+
+  df_analyze ();
+
+  /* Doloop optimization.  */
+  xtensa_reorg_loops ();