Message ID | CAFc0fxwXVkSf0---P42kdXWR7ibr02uOaiiz8L9Kc=fHTxWY1w@mail.gmail.com |
---|---|
State | New |
Headers | show |
PING? > > Hi Sterling, > > I made some improvement to the patch. Two changes: > 1. TARGET_LOOPS is now used as a condition of the doloop related > patterns, which is more elegant. > 2. As the trip count register of the zero-cost loop maybe potentially spilled, > we need to change the patterns in order to handle this issue. The solution is > similar to that adapted by c6x backend. > Just turn the zero-cost loop into a regular loop when that happens when reload > is completed. > Attached please find version 4 of the patch. Make check regression tested > with xtensa-elf-gcc/simulator. > OK for trunk? > > Index: gcc/ChangeLog > ================================================================ > === > --- gcc/ChangeLog (revision 216079) > +++ gcc/ChangeLog (working copy) > @@ -1,3 +1,20 @@ > +2014-10-10 Felix Yang <felix.yang@huawei.com> > + > + * config/xtensa/xtensa.h (TARGET_LOOPS): New Macro. > + * config/xtensa/xtensa.c (xtensa_reorg): New. > + (xtensa_reorg_loops): New. > + (xtensa_can_use_doloop_p): New. > + (xtensa_invalid_within_doloop): New. > + (hwloop_optimize): New. > + (hwloop_fail): New. > + (hwloop_pattern_reg): New. > + (xtensa_emit_loop_end): Modified to emit the zero-overhead loop end > label. > + (xtensa_doloop_hooks): Define. > + * config/xtensa/xtensa.md (doloop_end): New. > + (loop_end): New > + (zero_cost_loop_start): Rewritten. > + (zero_cost_loop_end): Rewritten. > + > 2014-10-10 Kyrylo Tkachov <kyrylo.tkachov@arm.com> > > * configure.ac: Add --enable-fix-cortex-a53-835769 option. 
> Index: gcc/config/xtensa/xtensa.md > ================================================================ > === > --- gcc/config/xtensa/xtensa.md (revision 216079) > +++ gcc/config/xtensa/xtensa.md (working copy) > @@ -35,6 +35,8 @@ > (UNSPEC_TLS_CALL 9) > (UNSPEC_TP 10) > (UNSPEC_MEMW 11) > + (UNSPEC_LSETUP_START 12) > + (UNSPEC_LSETUP_END 13) > > (UNSPECV_SET_FP 1) > (UNSPECV_ENTRY 2) > @@ -1289,41 +1291,120 @@ > (set_attr "length" "3")]) > > > +;; Zero-overhead looping support. > + > ;; Define the loop insns used by bct optimization to represent the -;; start and > end of a zero-overhead loop (in loop.c). This start -;; template generates the > loop insn; the end template doesn't generate -;; any instructions since loop end > is handled in hardware. > +;; start and end of a zero-overhead loop. This start template > +generates ;; the loop insn; the end template doesn't generate any > +instructions since ;; loop end is handled in hardware. > > (define_insn "zero_cost_loop_start" > [(set (pc) > - (if_then_else (eq (match_operand:SI 0 "register_operand" "a") > - (const_int 0)) > - (label_ref (match_operand 1 "" "")) > - (pc))) > - (set (reg:SI 19) > - (plus:SI (match_dup 0) (const_int -1)))] > - "" > - "loopnez\t%0, %l1" > + (if_then_else (ne (match_operand:SI 0 "register_operand" "2") > + (const_int 1)) > + (label_ref (match_operand 1 "" "")) > + (pc))) > + (set (match_operand:SI 2 "register_operand" "=a") > + (plus (match_dup 0) > + (const_int -1))) > + (unspec [(const_int 0)] UNSPEC_LSETUP_START)] "TARGET_LOOPS && > + optimize" > + "loop\t%0, %l1_LEND" > [(set_attr "type" "jump") > (set_attr "mode" "none") > (set_attr "length" "3")]) > > (define_insn "zero_cost_loop_end" > [(set (pc) > - (if_then_else (ne (reg:SI 19) (const_int 0)) > - (label_ref (match_operand 0 "" "")) > - (pc))) > - (set (reg:SI 19) > - (plus:SI (reg:SI 19) (const_int -1)))] > - "" > + (if_then_else (ne (match_operand:SI 0 "nonimmediate_operand" > "2,2") > + (const_int 1)) > + (label_ref 
(match_operand 1 "" "")) > + (pc))) > + (set (match_operand:SI 2 "nonimmediate_operand" "=a,m") > + (plus (match_dup 0) > + (const_int -1))) > + (unspec [(const_int 0)] UNSPEC_LSETUP_END) > + (clobber (match_scratch:SI 3 "=X,&r"))] "TARGET_LOOPS && optimize" > + "#" > + [(set_attr "type" "jump") > + (set_attr "mode" "none") > + (set_attr "length" "0")]) > + > +(define_insn "loop_end" > + [(set (pc) > + (if_then_else (ne (match_operand:SI 0 "register_operand" "2") > + (const_int 1)) > + (label_ref (match_operand 1 "" "")) > + (pc))) > + (set (match_operand:SI 2 "register_operand" "=a") > + (plus (match_dup 0) > + (const_int -1))) > + (unspec [(const_int 0)] UNSPEC_LSETUP_END)] > + "TARGET_LOOPS && optimize" > { > - xtensa_emit_loop_end (insn, operands); > - return ""; > + xtensa_emit_loop_end (insn, operands); return ""; > } > [(set_attr "type" "jump") > (set_attr "mode" "none") > (set_attr "length" "0")]) > > +(define_split > + [(set (pc) > + (if_then_else (ne (match_operand:SI 0 "nonimmediate_operand" "") > + (const_int 1)) > + (label_ref (match_operand 1 "" "")) > + (pc))) > + (set (match_operand:SI 2 "nonimmediate_operand" "") > + (plus:SI (match_dup 0) > + (const_int -1))) > + (unspec [(const_int 0)] UNSPEC_LSETUP_END) > + (clobber (match_scratch 3))] > + "TARGET_LOOPS && optimize && reload_completed" > + [(const_int 0)] > +{ > + if (!REG_P (operands[0])) > + { > + rtx test; > + > + /* Fallback into a normal conditional branch insn. 
*/ > + emit_move_insn (operands[3], operands[0]); > + emit_insn (gen_addsi3 (operands[3], operands[3], constm1_rtx)); > + emit_move_insn (operands[0], operands[3]); > + test = gen_rtx_NE (VOIDmode, operands[3], const0_rtx); > + emit_jump_insn (gen_cbranchsi4 (test, operands[3], > + const0_rtx, operands[1])); > + } > + else > + { > + emit_jump_insn (gen_loop_end (operands[0], operands[1], > operands[2])); > + } > + > + DONE; > +}) > + > +; operand 0 is the loop count pseudo register ; operand 1 is the label > +to jump to at the top of the loop (define_expand "doloop_end" > + [(parallel [(set (pc) (if_then_else > + (ne (match_operand:SI 0 "" "") > + (const_int 1)) > + (label_ref (match_operand 1 "" "")) > + (pc))) > + (set (match_dup 0) > + (plus:SI (match_dup 0) > + (const_int -1))) > + (unspec [(const_int 0)] UNSPEC_LSETUP_END) > + (clobber (match_dup 2))])] ; match_scratch > + "TARGET_LOOPS && optimize" > +{ > + /* The loop optimizer doesn't check the predicates... */ > + if (GET_MODE (operands[0]) != SImode) > + FAIL; > + operands[2] = gen_rtx_SCRATCH (SImode); > +}) > + > > ;; Setting a register from a comparison. > > Index: gcc/config/xtensa/xtensa.c > ================================================================ > === > --- gcc/config/xtensa/xtensa.c (revision 216079) > +++ gcc/config/xtensa/xtensa.c (working copy) > @@ -61,6 +61,8 @@ along with GCC; see the file COPYING3. 
If not see > #include "gimplify.h" > #include "df.h" > #include "builtins.h" > +#include "dumpfile.h" > +#include "hw-doloop.h" > > > /* Enumeration for all of the relational tests, so that we can build @@ -186,6 > +188,10 @@ static reg_class_t xtensa_secondary_reload (bool, > > static bool constantpool_address_p (const_rtx addr); static bool > xtensa_legitimate_constant_p (enum machine_mode, rtx); > +static void xtensa_reorg (void); > +static bool xtensa_can_use_doloop_p (const widest_int &, const widest_int > &, > + unsigned int, bool); static const > +char *xtensa_invalid_within_doloop (const rtx_insn *); > > static bool xtensa_member_type_forces_blk (const_tree, > enum machine_mode mode); @@ -312,6 > +318,15 @@ static const int reg_nonleaf_alloc_order[FIRST_PSE #undef > TARGET_LEGITIMATE_CONSTANT_P #define > TARGET_LEGITIMATE_CONSTANT_P xtensa_legitimate_constant_p > > +#undef TARGET_MACHINE_DEPENDENT_REORG > +#define TARGET_MACHINE_DEPENDENT_REORG xtensa_reorg > + > +#undef TARGET_CAN_USE_DOLOOP_P > +#define TARGET_CAN_USE_DOLOOP_P xtensa_can_use_doloop_p > + > +#undef TARGET_INVALID_WITHIN_DOLOOP > +#define TARGET_INVALID_WITHIN_DOLOOP xtensa_invalid_within_doloop > + > struct gcc_target targetm = TARGET_INITIALIZER; > > > @@ -1676,7 +1691,7 @@ xtensa_emit_loop_end (rtx_insn *insn, rtx *operand > } > } > > - output_asm_insn ("# loop end for %0", operands); > + output_asm_insn ("%1_LEND:", operands); > } > > > @@ -3712,4 +3727,236 @@ xtensa_legitimate_constant_p (enum > machine_mode mo > return !xtensa_tls_referenced_p (x); > } > > +/* Implement TARGET_CAN_USE_DOLOOP_P. */ > + > +static bool > +xtensa_can_use_doloop_p (const widest_int &, const widest_int &, > + unsigned int loop_depth, bool entered_at_top) > +{ > + /* Considering limitations in the hardware, only use doloop > + for innermost loops which must be entered from the top. 
*/ > + if (loop_depth > 1 || !entered_at_top) > + return false; > + > + return true; > +} > + > +/* NULL if INSN insn is valid within a low-overhead loop. > + Otherwise return why doloop cannot be applied. */ > + > +static const char * > +xtensa_invalid_within_doloop (const rtx_insn *insn) { > + if (CALL_P (insn)) > + return "Function call in the loop."; > + > + if (JUMP_P (insn) && INSN_CODE (insn) == CODE_FOR_return) > + return "Return from a call instruction in the loop."; > + > + return NULL; > +} > + > +/* Optimize LOOP. */ > + > +static bool > +hwloop_optimize (hwloop_info loop) > +{ > + int i; > + edge entry_edge; > + basic_block entry_bb; > + rtx iter_reg; > + rtx_insn *insn, *seq, *entry_after; > + > + if (loop->depth > 1) > + { > + if (dump_file) > + fprintf (dump_file, ";; loop %d is not innermost\n", > + loop->loop_no); > + return false; > + } > + > + if (!loop->incoming_dest) > + { > + if (dump_file) > + fprintf (dump_file, ";; loop %d has more than one entry\n", > + loop->loop_no); > + return false; > + } > + > + if (loop->incoming_dest != loop->head) > + { > + if (dump_file) > + fprintf (dump_file, ";; loop %d is not entered from head\n", > + loop->loop_no); > + return false; > + } > + > + if (loop->has_call || loop->has_asm) > + { > + if (dump_file) > + fprintf (dump_file, ";; loop %d has invalid insn\n", > + loop->loop_no); > + return false; > + } > + > + /* Scan all the blocks to make sure they don't use iter_reg. */ if > + (loop->iter_reg_used || loop->iter_reg_used_outside) > + { > + if (dump_file) > + fprintf (dump_file, ";; loop %d uses iterator\n", > + loop->loop_no); > + return false; > + } > + > + /* Check if start_label appears before doloop_end. 
*/ insn = > + loop->start_label; while (insn && insn != loop->loop_end) > + insn = NEXT_INSN (insn); > + > + if (!insn) > + { > + if (dump_file) > + fprintf (dump_file, ";; loop %d start_label not before loop_end\n", > + loop->loop_no); > + return false; > + } > + > + /* Get the loop iteration register. */ iter_reg = loop->iter_reg; > + > + gcc_assert (REG_P (iter_reg)); > + > + entry_edge = NULL; > + > + FOR_EACH_VEC_SAFE_ELT (loop->incoming, i, entry_edge) > + if (entry_edge->flags & EDGE_FALLTHRU) > + break; > + > + if (entry_edge == NULL) > + return false; > + > + /* Place the zero_cost_loop_start instruction before the loop. */ > + entry_bb = entry_edge->src; > + > + start_sequence (); > + > + insn = emit_insn (gen_zero_cost_loop_start (loop->iter_reg, > + loop->start_label, > + loop->iter_reg)); > + > + seq = get_insns (); > + > + if (!single_succ_p (entry_bb) || vec_safe_length (loop->incoming) > 1) > + { > + basic_block new_bb; > + edge e; > + edge_iterator ei; > + > + emit_insn_before (seq, BB_HEAD (loop->head)); > + seq = emit_label_before (gen_label_rtx (), seq); > + new_bb = create_basic_block (seq, insn, entry_bb); > + FOR_EACH_EDGE (e, ei, loop->incoming) > + { > + if (!(e->flags & EDGE_FALLTHRU)) > + redirect_edge_and_branch_force (e, new_bb); > + else > + redirect_edge_succ (e, new_bb); > + } > + > + make_edge (new_bb, loop->head, 0); > + } > + else > + { > + entry_after = BB_END (entry_bb); > + while (DEBUG_INSN_P (entry_after) > + || (NOTE_P (entry_after) > + && NOTE_KIND (entry_after) != > NOTE_INSN_BASIC_BLOCK)) > + entry_after = PREV_INSN (entry_after); > + > + emit_insn_after (seq, entry_after); > + } > + > + end_sequence (); > + > + return true; > +} > + > +/* A callback for the hw-doloop pass. Called when a loop we have discovered > + turns out not to be optimizable; we have to split the loop_end pattern into > + a subtract and a test. 
*/ > + > +static void > +hwloop_fail (hwloop_info loop) > +{ > + rtx test; > + rtx_insn *insn = loop->loop_end; > + > + emit_insn_before (gen_addsi3 (loop->iter_reg, > + loop->iter_reg, > + constm1_rtx), > + loop->loop_end); > + > + test = gen_rtx_NE (VOIDmode, loop->iter_reg, const0_rtx); insn = > + emit_jump_insn_before (gen_cbranchsi4 (test, > + loop->iter_reg, > const0_rtx, > + loop->start_label), > + loop->loop_end); > + > + JUMP_LABEL (insn) = loop->start_label; > + LABEL_NUSES (loop->start_label)++; > + delete_insn (loop->loop_end); > +} > + > +/* A callback for the hw-doloop pass. This function examines INSN; if > + it is a doloop_end pattern we recognize, return the reg rtx for the > + loop counter. Otherwise, return NULL_RTX. */ > + > +static rtx > +hwloop_pattern_reg (rtx_insn *insn) > +{ > + rtx reg; > + > + if (!JUMP_P (insn) || recog_memoized (insn) != CODE_FOR_loop_end) > + return NULL_RTX; > + > + reg = SET_DEST (XVECEXP (PATTERN (insn), 0, 1)); if (!REG_P (reg)) > + return NULL_RTX; > + > + return reg; > +} > + > + > +static struct hw_doloop_hooks xtensa_doloop_hooks = { > + hwloop_pattern_reg, > + hwloop_optimize, > + hwloop_fail > +}; > + > +/* Run from machine_dependent_reorg, this pass looks for doloop_end insns > + and tries to rewrite the RTL of these loops so that proper Xtensa > + hardware loops are generated. */ > + > +static void > +xtensa_reorg_loops (void) > +{ > + reorg_loops (false, &xtensa_doloop_hooks); } > + > +/* Implement the TARGET_MACHINE_DEPENDENT_REORG pass. */ > + > +static void > +xtensa_reorg (void) > +{ > + /* We are freeing block_for_insn in the toplev to keep compatibility > + with old MDEP_REORGS that are not CFG based. Recompute it now. > +*/ > + compute_bb_for_insn (); > + > + df_analyze (); > + > + /* Doloop optimization. 
*/ > + xtensa_reorg_loops (); > +} > + > #include "gt-xtensa.h" > Index: gcc/config/xtensa/xtensa.h > ================================================================ > === > --- gcc/config/xtensa/xtensa.h (revision 216079) > +++ gcc/config/xtensa/xtensa.h (working copy) > @@ -61,6 +61,7 @@ extern unsigned xtensa_current_frame_size; > #define TARGET_S32C1I XCHAL_HAVE_S32C1I > #define TARGET_ABSOLUTE_LITERALS XSHAL_USE_ABSOLUTE_LITERALS > #define TARGET_THREADPTR XCHAL_HAVE_THREADPTR > +#define TARGET_LOOPS XCHAL_HAVE_LOOPS > > #define TARGET_DEFAULT \ > ((XCHAL_HAVE_L32R ? 0 : MASK_CONST16) | \ > > Cheers, > Felix > > > On Thu, Oct 9, 2014 at 6:52 PM, Felix Yang <fei.yang0953@gmail.com> wrote: > > Hello Sterling, > > > > My paper work with the FSF has finished and we can now move > > forward with this patch :-) > > I rebased the patch on the latest trunk. Attached please find > > version 3 of the patch. > > And the enclosed patch also includes the two points pointed by > > you, do you like it? > > Make check regression tested with xtensa-elf-gcc built from trunk > > with this patch. > > OK to apply? > > > > Index: gcc/ChangeLog > > > ================================================================ > === > > --- gcc/ChangeLog (revision 216036) > > +++ gcc/ChangeLog (working copy) > > @@ -1,3 +1,19 @@ > > +2014-10-09 Felix Yang <felix.yang@huawei.com> > > + > > + * config/xtensa/xtensa.h (TARGET_LOOPS): New Macro. > > + * config/xtensa/xtensa.c (xtensa_reorg): New. > > + (xtensa_reorg_loops): New. > > + (xtensa_can_use_doloop_p): New. > > + (xtensa_invalid_within_doloop): New. > > + (hwloop_optimize): New. > > + (hwloop_fail): New. > > + (hwloop_pattern_reg): New. > > + (xtensa_emit_loop_end): Modified to emit the zero-overhead loop end > label. > > + (xtensa_doloop_hooks): Define. > > + * config/xtensa/xtensa.md (doloop_end): New. > > + (zero_cost_loop_start): Rewritten. > > + (zero_cost_loop_end): Rewritten. 
> > + > > 2014-10-09 Joern Rennecke <joern.rennecke@embecosm.com> > > > > * config/avr/avr.opt (mmcu=): Change to have a string value. > > Index: gcc/config/xtensa/xtensa.md > > > ================================================================ > === > > --- gcc/config/xtensa/xtensa.md (revision 216036) > > +++ gcc/config/xtensa/xtensa.md (working copy) > > @@ -35,6 +35,8 @@ > > (UNSPEC_TLS_CALL 9) > > (UNSPEC_TP 10) > > (UNSPEC_MEMW 11) > > + (UNSPEC_LSETUP_START 12) > > + (UNSPEC_LSETUP_END 13) > > > > (UNSPECV_SET_FP 1) > > (UNSPECV_ENTRY 2) > > @@ -1289,41 +1291,67 @@ > > (set_attr "length" "3")]) > > > > > > +;; Zero-overhead looping support. > > + > > ;; Define the loop insns used by bct optimization to represent the > > -;; start and end of a zero-overhead loop (in loop.c). This start -;; > > template generates the loop insn; the end template doesn't generate > > -;; any instructions since loop end is handled in hardware. > > +;; start and end of a zero-overhead loop. This start template > > +generates ;; the loop insn; the end template doesn't generate any > > +instructions since ;; loop end is handled in hardware. 
> > > > (define_insn "zero_cost_loop_start" > > [(set (pc) > > - (if_then_else (eq (match_operand:SI 0 "register_operand" "a") > > - (const_int 0)) > > - (label_ref (match_operand 1 "" "")) > > - (pc))) > > - (set (reg:SI 19) > > - (plus:SI (match_dup 0) (const_int -1)))] > > + (if_then_else (ne (match_operand:SI 0 "register_operand" "a") > > + (const_int 1)) > > + (label_ref (match_operand 1 "" "")) > > + (pc))) > > + (set (match_operand:SI 2 "register_operand" "+a0") > > + (plus (match_dup 2) > > + (const_int -1))) > > + (unspec [(const_int 0)] UNSPEC_LSETUP_START)] > > "" > > - "loopnez\t%0, %l1" > > + "loop\t%0, %l1_LEND" > > [(set_attr "type" "jump") > > (set_attr "mode" "none") > > (set_attr "length" "3")]) > > > > (define_insn "zero_cost_loop_end" > > [(set (pc) > > - (if_then_else (ne (reg:SI 19) (const_int 0)) > > - (label_ref (match_operand 0 "" "")) > > - (pc))) > > - (set (reg:SI 19) > > - (plus:SI (reg:SI 19) (const_int -1)))] > > + (if_then_else (ne (match_operand:SI 0 "register_operand" "a") > > + (const_int 1)) > > + (label_ref (match_operand 1 "" "")) > > + (pc))) > > + (set (match_operand:SI 2 "register_operand" "+a0") > > + (plus (match_dup 2) > > + (const_int -1))) > > + (unspec [(const_int 0)] UNSPEC_LSETUP_END)] > > "" > > { > > - xtensa_emit_loop_end (insn, operands); > > - return ""; > > + xtensa_emit_loop_end (insn, operands); return ""; > > } > > [(set_attr "type" "jump") > > (set_attr "mode" "none") > > (set_attr "length" "0")]) > > > > +; operand 0 is the loop count pseudo register ; operand 1 is the > > +label to jump to at the top of the loop (define_expand "doloop_end" > > + [(parallel [(set (pc) (if_then_else > > + (ne (match_operand:SI 0 "" "") > > + (const_int 1)) > > + (label_ref (match_operand 1 "" "")) > > + (pc))) > > + (set (match_dup 0) > > + (plus:SI (match_dup 0) > > + (const_int -1))) > > + (unspec [(const_int 0)] UNSPEC_LSETUP_END)])] > > + "" > > +{ > > + /* The loop optimizer doesn't check the predicates... 
*/ > > + if (GET_MODE (operands[0]) != SImode) > > + FAIL; > > +}) > > + > > > > ;; Setting a register from a comparison. > > > > Index: gcc/config/xtensa/xtensa.c > > > ================================================================ > === > > --- gcc/config/xtensa/xtensa.c (revision 216036) > > +++ gcc/config/xtensa/xtensa.c (working copy) > > @@ -61,6 +61,8 @@ along with GCC; see the file COPYING3. If not see > > #include "gimplify.h" > > #include "df.h" > > #include "builtins.h" > > +#include "dumpfile.h" > > +#include "hw-doloop.h" > > > > > > /* Enumeration for all of the relational tests, so that we can build > > @@ -186,6 +188,10 @@ static reg_class_t xtensa_secondary_reload (bool, > > > > static bool constantpool_address_p (const_rtx addr); > > static bool xtensa_legitimate_constant_p (enum machine_mode, rtx); > > +static void xtensa_reorg (void); > > +static bool xtensa_can_use_doloop_p (const widest_int &, const widest_int > &, > > + unsigned int, bool); > > +static const char *xtensa_invalid_within_doloop (const rtx_insn *); > > > > static bool xtensa_member_type_forces_blk (const_tree, > > enum machine_mode mode); > > @@ -312,6 +318,15 @@ static const int reg_nonleaf_alloc_order[FIRST_PSE > > #undef TARGET_LEGITIMATE_CONSTANT_P > > #define TARGET_LEGITIMATE_CONSTANT_P xtensa_legitimate_constant_p > > > > +#undef TARGET_MACHINE_DEPENDENT_REORG > > +#define TARGET_MACHINE_DEPENDENT_REORG xtensa_reorg > > + > > +#undef TARGET_CAN_USE_DOLOOP_P > > +#define TARGET_CAN_USE_DOLOOP_P xtensa_can_use_doloop_p > > + > > +#undef TARGET_INVALID_WITHIN_DOLOOP > > +#define TARGET_INVALID_WITHIN_DOLOOP xtensa_invalid_within_doloop > > + > > struct gcc_target targetm = TARGET_INITIALIZER; > > > > > > @@ -1676,7 +1691,7 @@ xtensa_emit_loop_end (rtx_insn *insn, rtx > *operand > > } > > } > > > > - output_asm_insn ("# loop end for %0", operands); > > + output_asm_insn ("%1_LEND:", operands); > > } > > > > > > @@ -3712,4 +3727,239 @@ xtensa_legitimate_constant_p (enum > 
machine_mode mo > > return !xtensa_tls_referenced_p (x); > > } > > > > +/* Implement TARGET_CAN_USE_DOLOOP_P. */ > > + > > +static bool > > +xtensa_can_use_doloop_p (const widest_int &, const widest_int &, > > + unsigned int loop_depth, bool > entered_at_top) > > +{ > > + if (!TARGET_LOOPS) > > + return false; > > + > > + /* Considering limitations in the hardware, only use doloop > > + for innermost loops which must be entered from the top. */ > > + if (loop_depth > 1 || !entered_at_top) > > + return false; > > + > > + return true; > > +} > > + > > +/* NULL if INSN insn is valid within a low-overhead loop. > > + Otherwise return why doloop cannot be applied. */ > > + > > +static const char * > > +xtensa_invalid_within_doloop (const rtx_insn *insn) > > +{ > > + if (CALL_P (insn)) > > + return "Function call in the loop."; > > + > > + if (JUMP_P (insn) && INSN_CODE (insn) == CODE_FOR_return) > > + return "Return from a call instruction in the loop."; > > + > > + return NULL; > > +} > > + > > +/* Optimize LOOP. 
*/ > > + > > +static bool > > +hwloop_optimize (hwloop_info loop) > > +{ > > + int i; > > + edge entry_edge; > > + basic_block entry_bb; > > + rtx iter_reg; > > + rtx_insn *insn, *seq, *entry_after; > > + > > + if (loop->depth > 1) > > + { > > + if (dump_file) > > + fprintf (dump_file, ";; loop %d is not innermost\n", > > + loop->loop_no); > > + return false; > > + } > > + > > + if (!loop->incoming_dest) > > + { > > + if (dump_file) > > + fprintf (dump_file, ";; loop %d has more than one entry\n", > > + loop->loop_no); > > + return false; > > + } > > + > > + if (loop->incoming_dest != loop->head) > > + { > > + if (dump_file) > > + fprintf (dump_file, ";; loop %d is not entered from head\n", > > + loop->loop_no); > > + return false; > > + } > > + > > + if (loop->has_call || loop->has_asm) > > + { > > + if (dump_file) > > + fprintf (dump_file, ";; loop %d has invalid insn\n", > > + loop->loop_no); > > + return false; > > + } > > + > > + /* Scan all the blocks to make sure they don't use iter_reg. */ > > + if (loop->iter_reg_used || loop->iter_reg_used_outside) > > + { > > + if (dump_file) > > + fprintf (dump_file, ";; loop %d uses iterator\n", > > + loop->loop_no); > > + return false; > > + } > > + > > + /* Check if start_label appears before doloop_end. */ > > + insn = loop->start_label; > > + while (insn && insn != loop->loop_end) > > + insn = NEXT_INSN (insn); > > + > > + if (!insn) > > + { > > + if (dump_file) > > + fprintf (dump_file, ";; loop %d start_label not before loop_end\n", > > + loop->loop_no); > > + return false; > > + } > > + > > + /* Get the loop iteration register. */ > > + iter_reg = loop->iter_reg; > > + > > + gcc_assert (REG_P (iter_reg)); > > + > > + entry_edge = NULL; > > + > > + FOR_EACH_VEC_SAFE_ELT (loop->incoming, i, entry_edge) > > + if (entry_edge->flags & EDGE_FALLTHRU) > > + break; > > + > > + if (entry_edge == NULL) > > + return false; > > + > > + /* Place the zero_cost_loop_start instruction before the loop. 
*/ > > + entry_bb = entry_edge->src; > > + > > + start_sequence (); > > + > > + insn = emit_insn (gen_zero_cost_loop_start (loop->iter_reg, > > + loop->start_label, > > + loop->iter_reg)); > > + > > + seq = get_insns (); > > + > > + if (!single_succ_p (entry_bb) || vec_safe_length (loop->incoming) > 1) > > + { > > + basic_block new_bb; > > + edge e; > > + edge_iterator ei; > > + > > + emit_insn_before (seq, BB_HEAD (loop->head)); > > + seq = emit_label_before (gen_label_rtx (), seq); > > + new_bb = create_basic_block (seq, insn, entry_bb); > > + FOR_EACH_EDGE (e, ei, loop->incoming) > > + { > > + if (!(e->flags & EDGE_FALLTHRU)) > > + redirect_edge_and_branch_force (e, new_bb); > > + else > > + redirect_edge_succ (e, new_bb); > > + } > > + > > + make_edge (new_bb, loop->head, 0); > > + } > > + else > > + { > > + entry_after = BB_END (entry_bb); > > + while (DEBUG_INSN_P (entry_after) > > + || (NOTE_P (entry_after) > > + && NOTE_KIND (entry_after) != > NOTE_INSN_BASIC_BLOCK)) > > + entry_after = PREV_INSN (entry_after); > > + > > + emit_insn_after (seq, entry_after); > > + } > > + > > + end_sequence (); > > + > > + return true; > > +} > > + > > +/* A callback for the hw-doloop pass. Called when a loop we have > discovered > > + turns out not to be optimizable; we have to split the loop_end pattern > into > > + a subtract and a test. */ > > + > > +static void > > +hwloop_fail (hwloop_info loop) > > +{ > > + rtx test; > > + rtx_insn *insn = loop->loop_end; > > + > > + emit_insn_before (gen_addsi3 (loop->iter_reg, > > + loop->iter_reg, > > + constm1_rtx), > > + loop->loop_end); > > + > > + test = gen_rtx_NE (VOIDmode, loop->iter_reg, const0_rtx); > > + insn = emit_jump_insn_before (gen_cbranchsi4 (test, > > + loop->iter_reg, > const0_rtx, > > + > loop->start_label), > > + loop->loop_end); > > + > > + JUMP_LABEL (insn) = loop->start_label; > > + LABEL_NUSES (loop->start_label)++; > > + delete_insn (loop->loop_end); > > +} > > + > > +/* A callback for the hw-doloop pass. 
This function examines INSN; if > > + it is a doloop_end pattern we recognize, return the reg rtx for the > > + loop counter. Otherwise, return NULL_RTX. */ > > + > > +static rtx > > +hwloop_pattern_reg (rtx_insn *insn) > > +{ > > + rtx reg; > > + > > + if (!JUMP_P (insn) || recog_memoized (insn) != > CODE_FOR_zero_cost_loop_end) > > + return NULL_RTX; > > + > > + reg = SET_DEST (XVECEXP (PATTERN (insn), 0, 1)); > > + if (!REG_P (reg)) > > + return NULL_RTX; > > + > > + return reg; > > +} > > + > > + > > +static struct hw_doloop_hooks xtensa_doloop_hooks = > > +{ > > + hwloop_pattern_reg, > > + hwloop_optimize, > > + hwloop_fail > > +}; > > + > > +/* Run from machine_dependent_reorg, this pass looks for doloop_end > insns > > + and tries to rewrite the RTL of these loops so that proper Xtensa > > + hardware loops are generated. */ > > + > > +static void > > +xtensa_reorg_loops (void) > > +{ > > + reorg_loops (false, &xtensa_doloop_hooks); > > +} > > + > > +/* Implement the TARGET_MACHINE_DEPENDENT_REORG pass. */ > > + > > +static void > > +xtensa_reorg (void) > > +{ > > + /* We are freeing block_for_insn in the toplev to keep compatibility > > + with old MDEP_REORGS that are not CFG based. Recompute it now. > */ > > + compute_bb_for_insn (); > > + > > + df_analyze (); > > + > > + /* Doloop optimization. */ > > + xtensa_reorg_loops (); > > +} > > + > > #include "gt-xtensa.h" > > Index: gcc/config/xtensa/xtensa.h > > > ================================================================ > === > > --- gcc/config/xtensa/xtensa.h (revision 216036) > > +++ gcc/config/xtensa/xtensa.h (working copy) > > @@ -61,6 +61,7 @@ extern unsigned xtensa_current_frame_size; > > #define TARGET_S32C1I XCHAL_HAVE_S32C1I > > #define TARGET_ABSOLUTE_LITERALS XSHAL_USE_ABSOLUTE_LITERALS > > #define TARGET_THREADPTR XCHAL_HAVE_THREADPTR > > +#define TARGET_LOOPS XCHAL_HAVE_LOOPS > > > > #define TARGET_DEFAULT \ > > ((XCHAL_HAVE_L32R ? 
0 : MASK_CONST16) | \ > > Cheers, > > Felix > > > > > > On Tue, Jan 14, 2014 at 1:23 AM, Sterling Augustine > > <augustine.sterling@gmail.com> wrote: > >> On Thu, Jan 9, 2014 at 7:48 PM, Yangfei (Felix) <felix.yang@huawei.com> > wrote: > >>> And here is the xtensa configuration tested (include/xtensa-config.h): > >>> > >>> #define XCHAL_HAVE_BE 0 > >>> #define XCHAL_HAVE_LOOPS 1 > >> > >> > >> Hi Felix, > >> > >> I like this patch, and expect I will approve it. However, I would like > >> you to do two more things before I do: > >> > >> 1. Ensure it doesn't generate zcl's when: > >> > >> #define XCHAL_HAVE_LOOPS 0 > >> > >> 2. Ensure it doesn't produce loops bodies that contain ret, retw, > >> ret.n or retw.n as the last instruction. It might be easier to just > >> disallow them in loop bodies entirely though. > >> > >> Thanks!
On Fri, Oct 10, 2014 at 6:59 AM, Felix Yang <fei.yang0953@gmail.com> wrote: > Hi Sterling, > > I made some improvement to the patch. Two changes: > 1. TARGET_LOOPS is now used as a condition of the doloop related > patterns, which is more elegant. Fine. > 2. As the trip count register of the zero-cost loop maybe > potentially spilled, we need to change the patterns in order to handle > this issue. Actually, for xtensa you don't. The trip count is copied into LCOUNT at the execution of the loop instruction, and therefore a spill or whatever doesn't matter--it won't affect the result. So as long as you have the trip count at the start of the loop, you are fine. This does bring up an issue of whether or not the trip count can be modified during the loop. (note that this is different than early exit.) If it can, you can't use a zero-overhead loop. Does your patch address this case? > The solution is similar to that adapted by c6x backend. > Just turn the zero-cost loop into a regular loop when that happens > when reload is completed. > Attached please find version 4 of the patch. Make check regression > tested with xtensa-elf-gcc/simulator. > OK for trunk?
Thanks for the comments. The patch checked the usage of the trip count register, making sure that it is not used in the loop body other than the doloop_end or lives past the doloop_end instruction, as the following code snippet shows: + /* Scan all the blocks to make sure they don't use iter_reg. */ + if (loop->iter_reg_used || loop->iter_reg_used_outside) + { + if (dump_file) + fprintf (dump_file, ";; loop %d uses iterator\n", + loop->loop_no); + return false; + } For the spill issue, I think we need to handle it. The reason is that currently we are not telling GCC about the existence of the LCOUNT register. Instead, we keep the trip count in a general register and it's possible that this register can be spilled when register pressure is high. It's a good idea to post another patch to describe the LCOUNT register in GCC in order to free this general register. But I want this patch applied as a first step, OK? Cheers, Felix On Tue, Oct 14, 2014 at 12:09 AM, augustine.sterling@gmail.com <augustine.sterling@gmail.com> wrote: > On Fri, Oct 10, 2014 at 6:59 AM, Felix Yang <fei.yang0953@gmail.com> wrote: >> Hi Sterling, >> >> I made some improvement to the patch. Two changes: >> 1. TARGET_LOOPS is now used as a condition of the doloop related >> patterns, which is more elegant. > > Fine. > >> 2. As the trip count register of the zero-cost loop maybe >> potentially spilled, we need to change the patterns in order to handle >> this issue. > > Actually, for xtensa you don't. The trip count is copied into LCOUNT > at the execution of the loop instruction, and therefore a spill or > whatever doesn't matter--it won't affect the result. So as long as you > have the trip count at the start of the loop, you are fine. > > This does bring up an issue of whether or not the trip count can be > modified during the loop. (note that this is different than early > exit.) If it can, you can't use a zero-overhead loop. Does your patch > address this case.
> > The solution is similar to that adapted by c6x backend. >> Just turn the zero-cost loop into a regular loop when that happens >> when reload is completed. >> Attached please find version 4 of the patch. Make check regression >> tested with xtensa-elf-gcc/simulator. >> OK for trunk?
PING? Cheers, Felix On Tue, Oct 14, 2014 at 12:30 AM, Felix Yang <fei.yang0953@gmail.com> wrote: > Thanks for the comments. > > The patch checked the usage of teh trip count register, making sure > that it is not used in the loop body other than the doloop_end or > lives past the doloop_end instruction, as the following code snippet > shows: > > + /* Scan all the blocks to make sure they don't use iter_reg. */ > + if (loop->iter_reg_used || loop->iter_reg_used_outside) > + { > + if (dump_file) > + fprintf (dump_file, ";; loop %d uses iterator\n", > + loop->loop_no); > + return false; > + } > > For the spill issue, I think we need to handle it. The reason is > that currently we are not telling GCC about the existence of the > LCOUNT register. Instead, we keep the trip count in a general register > and it's possible that this register can be spilled when register > pressure is high. > It's a good idea to post another patch to describe the LCOUNT > register in GCC in order to free this general register. But I want > this patch applied as a first step, OK? > > Cheers, > Felix > > > On Tue, Oct 14, 2014 at 12:09 AM, augustine.sterling@gmail.com > <augustine.sterling@gmail.com> wrote: >> On Fri, Oct 10, 2014 at 6:59 AM, Felix Yang <fei.yang0953@gmail.com> wrote: >>> Hi Sterling, >>> >>> I made some improvement to the patch. Two changes: >>> 1. TARGET_LOOPS is now used as a condition of the doloop related >>> patterns, which is more elegant. >> >> Fine. >> >>> 2. As the trip count register of the zero-cost loop maybe >>> potentially spilled, we need to change the patterns in order to handle >>> this issue. >> >> Actually, for xtensa you don't. The trip count is copied into LCOUNT >> at the execution of the loop instruction, and therefore a spill or >> whatever doesn't matter--it won't affect the result. So as long as you >> have the trip count at the start of the loop, you are fine. 
>> >> This does bring up an issue of whether or not the trip count can be >> modified during the loop. (note that this is different than early >> exit.) If it can, you can't use a zero-overhead loop. Does your patch >> address this case. >> >> The solution is similar to that adapted by c6x backend. >>> Just turn the zero-cost loop into a regular loop when that happens >>> when reload is completed. >>> Attached please find version 4 of the patch. Make check regression >>> tested with xtensa-elf-gcc/simulator. >>> OK for trunk?
On Tue, Oct 14, 2014 at 8:39 AM, Felix Yang <fei.yang0953@gmail.com> wrote: > PING? > Cheers, > Felix Felix, This isn't my day job, 24-hour pings are unproductive. You shouldn't need to worry about the trip count register getting spilled. It makes no difference whatsoever to how the loop operates--the trip count is dead with regards to the loop once the instruction executes. You don't need to describe LCOUNT to gcc in order for this not to matter. It should be enough to describe the zcl as consuming the value in the same way a branch instruction consumes a value. If you have a case where spilling it is causing a problem, then there is a bug in your code, papered over by dropping case when it is spilled. Similarly with iter_reg_used_outside--it shouldn't affect whether or not a zcl is valid here. If you have a case where it does, there is likely a bug in your code. If the code is easier to write by maintaining trip_count up, then fine (for now); you give up some performance (in fact, a lot of performance), but that doesn't matter as to the correctness. > > > On Tue, Oct 14, 2014 at 12:30 AM, Felix Yang <fei.yang0953@gmail.com> wrote: >> Thanks for the comments. >> >> The patch checked the usage of teh trip count register, making sure >> that it is not used in the loop body other than the doloop_end or >> lives past the doloop_end instruction, as the following code snippet >> shows: >> >> + /* Scan all the blocks to make sure they don't use iter_reg. */ >> + if (loop->iter_reg_used || loop->iter_reg_used_outside) >> + { >> + if (dump_file) >> + fprintf (dump_file, ";; loop %d uses iterator\n", >> + loop->loop_no); >> + return false; >> + } >> >> For the spill issue, I think we need to handle it. The reason is >> that currently we are not telling GCC about the existence of the >> LCOUNT register. Instead, we keep the trip count in a general register >> and it's possible that this register can be spilled when register >> pressure is high. 
>> It's a good idea to post another patch to describe the LCOUNT >> register in GCC in order to free this general register. But I want >> this patch applied as a first step, OK? >> >> Cheers, >> Felix >> >> >> On Tue, Oct 14, 2014 at 12:09 AM, augustine.sterling@gmail.com >> <augustine.sterling@gmail.com> wrote: >>> On Fri, Oct 10, 2014 at 6:59 AM, Felix Yang <fei.yang0953@gmail.com> wrote: >>>> Hi Sterling, >>>> >>>> I made some improvement to the patch. Two changes: >>>> 1. TARGET_LOOPS is now used as a condition of the doloop related >>>> patterns, which is more elegant. >>> >>> Fine. >>> >>>> 2. As the trip count register of the zero-cost loop maybe >>>> potentially spilled, we need to change the patterns in order to handle >>>> this issue. >>> >>> Actually, for xtensa you don't. The trip count is copied into LCOUNT >>> at the execution of the loop instruction, and therefore a spill or >>> whatever doesn't matter--it won't affect the result. So as long as you >>> have the trip count at the start of the loop, you are fine. >>> >>> This does bring up an issue of whether or not the trip count can be >>> modified during the loop. (note that this is different than early >>> exit.) If it can, you can't use a zero-overhead loop. Does your patch >>> address this case. >>> >>> The solution is similar to that adapted by c6x backend. >>>> Just turn the zero-cost loop into a regular loop when that happens >>>> when reload is completed. >>>> Attached please find version 4 of the patch. Make check regression >>>> tested with xtensa-elf-gcc/simulator. >>>> OK for trunk?
Hi Sterling, Since the patch is delayed for a long time, I'm kind of pushing it. Sorry for that. Yeah, you are right. We have some performance issue here as GCC may use one more general register in some cases with this patch. Take the following arraysum testcase for example. In doloop optimization, GCC figures out that the number of iterations is 1024 and creates a new pseudo 79 as the new trip count register. The pseudo 79 is live throughout the loop, this makes the register pressure in the loop higher. And it's possible that this new pseudo is spilled by reload when the register pressure is very high. I know that the xtensa loop instruction copies the trip count register into the LCOUNT special register. And we need to describe this hardware feature in GCC in order to free the trip count register. But I find it difficult to do. Do you have any good suggestions on this? arraysum.c: int g[1024]; int g_sum; void test_entry () { int i, Sum = 0; for (i = 0; i < 1024; i++) Sum = Sum + g[i]; g_sum = Sum; } 1. 
RTL before the doloop optimization pass(arraysum.c.193r.loop2_invariant): (note 34 0 32 2 [bb 2] NOTE_INSN_BASIC_BLOCK) (note 32 34 36 2 NOTE_INSN_FUNCTION_BEG) (insn 36 32 37 2 (set (reg:SI 72 [ ivtmp$8 ]) (mem/u/c:SI (symbol_ref/u:SI ("*.LC2") [flags 0x2]) [2 S4 A32])) 29 {movsi_internal} (expr_list:REG_EQUAL (symbol_ref:SI ("g") <var_decl 0x7f6eef5d62d0 g>) (nil))) (insn 37 36 33 2 (set (reg/f:SI 76 [ D.1393 ]) (mem/u/c:SI (symbol_ref/u:SI ("*.LC3") [flags 0x2]) [2 S4 A32])) 29 {movsi_internal} (expr_list:REG_EQUAL (const:SI (plus:SI (symbol_ref:SI ("g") <var_decl 0x7f6eef5d62d0 g>) (const_int 4096 [0x1000]))) (nil))) (insn 33 37 42 2 (set (reg/v:SI 74 [ Sum ]) (const_int 0 [0])) arraysum.c:6 29 {movsi_internal} (nil)) (code_label 42 33 38 3 2 "" [0 uses]) (note 38 42 39 3 [bb 3] NOTE_INSN_BASIC_BLOCK) (insn 39 38 40 3 (set (reg:SI 77 [ MEM[base: _14, offset: 0B] ]) (mem:SI (reg:SI 72 [ ivtmp$8 ]) [2 MEM[base: _14, offset: 0B]+0 S4 A32])) arraysum.c:9 29 {movsi_internal} (nil)) (insn 40 39 41 3 (set (reg/v:SI 74 [ Sum ]) (plus:SI (reg/v:SI 74 [ Sum ]) (reg:SI 77 [ MEM[base: _14, offset: 0B] ]))) arraysum.c:9 1 {addsi3} (expr_list:REG_DEAD (reg:SI 77 [ MEM[base: _14, offset: 0B] ]) (nil))) (insn 41 40 43 3 (set (reg:SI 72 [ ivtmp$8 ]) (plus:SI (reg:SI 72 [ ivtmp$8 ]) (const_int 4 [0x4]))) 1 {addsi3} (nil)) (jump_insn 43 41 52 3 (set (pc) (if_then_else (ne (reg:SI 72 [ ivtmp$8 ]) (reg/f:SI 76 [ D.1393 ])) (label_ref:SI 52) (pc))) arraysum.c:8 39 {*btrue} (int_list:REG_BR_PROB 9899 (nil)) -> 52) (code_label 52 43 51 5 3 "" [1 uses]) (note 51 52 44 5 [bb 5] NOTE_INSN_BASIC_BLOCK) (note 44 51 45 4 [bb 4] NOTE_INSN_BASIC_BLOCK) (insn 45 44 46 4 (set (reg/f:SI 78) (mem/u/c:SI (symbol_ref/u:SI ("*.LC4") [flags 0x2]) [2 S4 A32])) arraysum.c:11 29 {movsi_internal} (expr_list:REG_EQUAL (symbol_ref:SI ("g_sum") <var_decl 0x7f6eef5d6360 g_sum>) (nil))) (insn 46 45 0 4 (set (mem/c:SI (reg/f:SI 78) [2 g_sum+0 S4 A32]) (reg/v:SI 74 [ Sum ])) arraysum.c:11 29 {movsi_internal} 
(expr_list:REG_DEAD (reg/f:SI 78) (expr_list:REG_DEAD (reg/v:SI 74 [ Sum ]) (nil)))) 2. RTL after the doloop optimization pass(arraysum.c.195r.loop2_doloop): (note 34 0 32 2 [bb 2] NOTE_INSN_BASIC_BLOCK) (note 32 34 36 2 NOTE_INSN_FUNCTION_BEG) (insn 36 32 37 2 (set (reg:SI 72 [ ivtmp$8 ]) (mem/u/c:SI (symbol_ref/u:SI ("*.LC2") [flags 0x2]) [2 S4 A32])) 29 {movsi_internal} (expr_list:REG_EQUAL (symbol_ref:SI ("g") <var_decl 0x7f6eef5d62d0 g>) (nil))) (insn 37 36 33 2 (set (reg/f:SI 76 [ D.1393 ]) (mem/u/c:SI (symbol_ref/u:SI ("*.LC3") [flags 0x2]) [2 S4 A32])) 29 {movsi_internal} (expr_list:REG_EQUAL (const:SI (plus:SI (symbol_ref:SI ("g") <var_decl 0x7f6eef5d62d0 g>) (const_int 4096 [0x1000]))) (nil))) (insn 33 37 54 2 (set (reg/v:SI 74 [ Sum ]) (const_int 0 [0])) arraysum.c:6 29 {movsi_internal} (nil)) (insn 54 33 42 2 (set (reg:SI 79) (const_int 1024 [0x400])) arraysum.c:6 -1 (nil)) (code_label 42 54 38 3 2 "" [0 uses]) (note 38 42 39 3 [bb 3] NOTE_INSN_BASIC_BLOCK) (insn 39 38 40 3 (set (reg:SI 77 [ MEM[base: _14, offset: 0B] ]) (mem:SI (reg:SI 72 [ ivtmp$8 ]) [2 MEM[base: _14, offset: 0B]+0 S4 A32])) arraysum.c:9 29 {movsi_internal} (nil)) (insn 40 39 41 3 (set (reg/v:SI 74 [ Sum ]) (plus:SI (reg/v:SI 74 [ Sum ]) (reg:SI 77 [ MEM[base: _14, offset: 0B] ]))) arraysum.c:9 1 {addsi3} (expr_list:REG_DEAD (reg:SI 77 [ MEM[base: _14, offset: 0B] ]) (nil))) (insn 41 40 53 3 (set (reg:SI 72 [ ivtmp$8 ]) (plus:SI (reg:SI 72 [ ivtmp$8 ]) (const_int 4 [0x4]))) 1 {addsi3} (nil)) (jump_insn 53 41 52 3 (parallel [ (set (pc) (if_then_else (ne (reg:SI 79) (const_int 1 [0x1])) (label_ref 52) (pc))) (set (reg:SI 79) (plus:SI (reg:SI 79) (const_int -1 [0xffffffffffffffff]))) (unspec [ (const_int 0 [0]) ] 13) (clobber (scratch:SI)) ]) -1 (int_list:REG_BR_PROB 9899 (nil)) -> 52) (code_label 52 53 51 5 3 "" [1 uses]) (note 51 52 44 5 [bb 5] NOTE_INSN_BASIC_BLOCK) (note 44 51 45 4 [bb 4] NOTE_INSN_BASIC_BLOCK) (insn 45 44 46 4 (set (reg/f:SI 78) (mem/u/c:SI (symbol_ref/u:SI 
("*.LC4") [flags 0x2]) [2 S4 A32])) arraysum.c:11 29 {movsi_internal} (expr_list:REG_EQUAL (symbol_ref:SI ("g_sum") <var_decl 0x7f6eef5d6360 g_sum>) (nil))) (insn 46 45 0 4 (set (mem/c:SI (reg/f:SI 78) [2 g_sum+0 S4 A32]) (reg/v:SI 74 [ Sum ])) arraysum.c:11 29 {movsi_internal} (expr_list:REG_DEAD (reg/f:SI 78) (expr_list:REG_DEAD (reg/v:SI 74 [ Sum ]) (nil)))) > > On Tue, Oct 14, 2014 at 8:39 AM, Felix Yang <fei.yang0953@gmail.com> wrote: > > PING? > > Cheers, > > Felix > > Felix, > > This isn't my day job, 24-hour pings are unproductive. > > You shouldn't need to worry about the trip count register getting spilled. It > makes no difference whatsoever to how the loop operates--the trip count is > dead with regards to the loop once the instruction executes. You don't need to > describe LCOUNT to gcc in order for this not to matter. It should be enough to > describe the zcl as consuming the value in the same way a branch instruction > consumes a value. > > If you have a case where spilling it is causing a problem, then there is a bug in > your code, papered over by dropping case when it is spilled. Similarly with > iter_reg_used_outside--it shouldn't affect whether or not a zcl is valid here. If > you have a case where it does, there is likely a bug in your code. > > If the code is easier to write by maintaining trip_count up, then fine (for now); > you give up some performance (in fact, a lot of performance), but that doesn't > matter as to the correctness. > > > > > > > > On Tue, Oct 14, 2014 at 12:30 AM, Felix Yang <fei.yang0953@gmail.com> > wrote: > >> Thanks for the comments. > >> > >> The patch checked the usage of teh trip count register, making sure > >> that it is not used in the loop body other than the doloop_end or > >> lives past the doloop_end instruction, as the following code snippet > >> shows: > >> > >> + /* Scan all the blocks to make sure they don't use iter_reg. 
*/ > >> + if (loop->iter_reg_used || loop->iter_reg_used_outside) > >> + { > >> + if (dump_file) > >> + fprintf (dump_file, ";; loop %d uses iterator\n", > >> + loop->loop_no); > >> + return false; > >> + } > >> > >> For the spill issue, I think we need to handle it. The reason is > >> that currently we are not telling GCC about the existence of the > >> LCOUNT register. Instead, we keep the trip count in a general > >> register and it's possible that this register can be spilled when > >> register pressure is high. > >> It's a good idea to post another patch to describe the LCOUNT > >> register in GCC in order to free this general register. But I want > >> this patch applied as a first step, OK? > >> > >> Cheers, > >> Felix > >> > >> > >> On Tue, Oct 14, 2014 at 12:09 AM, augustine.sterling@gmail.com > >> <augustine.sterling@gmail.com> wrote: > >>> On Fri, Oct 10, 2014 at 6:59 AM, Felix Yang <fei.yang0953@gmail.com> > wrote: > >>>> Hi Sterling, > >>>> > >>>> I made some improvement to the patch. Two changes: > >>>> 1. TARGET_LOOPS is now used as a condition of the doloop > >>>> related patterns, which is more elegant. > >>> > >>> Fine. > >>> > >>>> 2. As the trip count register of the zero-cost loop maybe > >>>> potentially spilled, we need to change the patterns in order to > >>>> handle this issue. > >>> > >>> Actually, for xtensa you don't. The trip count is copied into LCOUNT > >>> at the execution of the loop instruction, and therefore a spill or > >>> whatever doesn't matter--it won't affect the result. So as long as > >>> you have the trip count at the start of the loop, you are fine. > >>> > >>> This does bring up an issue of whether or not the trip count can be > >>> modified during the loop. (note that this is different than early > >>> exit.) If it can, you can't use a zero-overhead loop. Does your > >>> patch address this case. > >>> > >>> The solution is similar to that adapted by c6x backend. 
> >>>> Just turn the zero-cost loop into a regular loop when that happens > >>>> when reload is completed. > >>>> Attached please find version 4 of the patch. Make check > >>>> regression tested with xtensa-elf-gcc/simulator. > >>>> OK for trunk?
On Wed, Oct 15, 2014 at 7:10 PM, Yangfei (Felix) <felix.yang@huawei.com> wrote: > Hi Sterling, > > Since the patch is delayed for a long time, I'm kind of pushing it. Sorry for that. > Yeah, you are right. We have some performance issue here as GCC may use one more general register in some cases with this patch. > Take the following arraysum testcase for example. In doloop optimization, GCC figures out that the number of iterations is 1024 and creates a new pseudo 79 as the new trip count register. > The pseudo 79 is live throughout the loop, this makes the register pressure in the loop higher. And it's possible that this new pseudo is spilled by reload when the register pressure is very high. > I know that the xtensa loop instruction copies the trip count register into the LCOUNT special register. And we need describe this hardware feature in GCC in order to free the trip count register. > But I find it difficult to do. Do you have any good suggestions on this? There are two issues related to the trip count, one I would like you to solve now, one later. 1. Later: The trip count doesn't need to be updated at all inside these loops, once the loop instruction executes. The code below relates to this case. 2. Now: You should be able to use a loop instruction regardless of whether the trip count is spilled. If you have an example where that wouldn't work, I would love to see it. > > arraysum.c: > int g[1024]; > int g_sum; > > void test_entry () > { > int i, Sum = 0; > > for (i = 0; i < 1024; i++) > Sum = Sum + g[i]; > > g_sum = Sum; > } > > > 1. 
RTL before the doloop optimization pass(arraysum.c.193r.loop2_invariant): > (note 34 0 32 2 [bb 2] NOTE_INSN_BASIC_BLOCK) > (note 32 34 36 2 NOTE_INSN_FUNCTION_BEG) > (insn 36 32 37 2 (set (reg:SI 72 [ ivtmp$8 ]) > (mem/u/c:SI (symbol_ref/u:SI ("*.LC2") [flags 0x2]) [2 S4 A32])) 29 {movsi_internal} > (expr_list:REG_EQUAL (symbol_ref:SI ("g") <var_decl 0x7f6eef5d62d0 g>) > (nil))) > (insn 37 36 33 2 (set (reg/f:SI 76 [ D.1393 ]) > (mem/u/c:SI (symbol_ref/u:SI ("*.LC3") [flags 0x2]) [2 S4 A32])) 29 {movsi_internal} > (expr_list:REG_EQUAL (const:SI (plus:SI (symbol_ref:SI ("g") <var_decl 0x7f6eef5d62d0 g>) > (const_int 4096 [0x1000]))) > (nil))) > (insn 33 37 42 2 (set (reg/v:SI 74 [ Sum ]) > (const_int 0 [0])) arraysum.c:6 29 {movsi_internal} > (nil)) > (code_label 42 33 38 3 2 "" [0 uses]) > (note 38 42 39 3 [bb 3] NOTE_INSN_BASIC_BLOCK) > (insn 39 38 40 3 (set (reg:SI 77 [ MEM[base: _14, offset: 0B] ]) > (mem:SI (reg:SI 72 [ ivtmp$8 ]) [2 MEM[base: _14, offset: 0B]+0 S4 A32])) arraysum.c:9 29 {movsi_internal} > (nil)) > (insn 40 39 41 3 (set (reg/v:SI 74 [ Sum ]) > (plus:SI (reg/v:SI 74 [ Sum ]) > (reg:SI 77 [ MEM[base: _14, offset: 0B] ]))) arraysum.c:9 1 {addsi3} > (expr_list:REG_DEAD (reg:SI 77 [ MEM[base: _14, offset: 0B] ]) > (nil))) > (insn 41 40 43 3 (set (reg:SI 72 [ ivtmp$8 ]) > (plus:SI (reg:SI 72 [ ivtmp$8 ]) > (const_int 4 [0x4]))) 1 {addsi3} > (nil)) > (jump_insn 43 41 52 3 (set (pc) > (if_then_else (ne (reg:SI 72 [ ivtmp$8 ]) > (reg/f:SI 76 [ D.1393 ])) > (label_ref:SI 52) > (pc))) arraysum.c:8 39 {*btrue} > (int_list:REG_BR_PROB 9899 (nil)) > -> 52) > (code_label 52 43 51 5 3 "" [1 uses]) > (note 51 52 44 5 [bb 5] NOTE_INSN_BASIC_BLOCK) > (note 44 51 45 4 [bb 4] NOTE_INSN_BASIC_BLOCK) > (insn 45 44 46 4 (set (reg/f:SI 78) > (mem/u/c:SI (symbol_ref/u:SI ("*.LC4") [flags 0x2]) [2 S4 A32])) arraysum.c:11 29 {movsi_internal} > (expr_list:REG_EQUAL (symbol_ref:SI ("g_sum") <var_decl 0x7f6eef5d6360 g_sum>) > (nil))) > (insn 46 45 0 4 (set (mem/c:SI 
(reg/f:SI 78) [2 g_sum+0 S4 A32]) > (reg/v:SI 74 [ Sum ])) arraysum.c:11 29 {movsi_internal} > (expr_list:REG_DEAD (reg/f:SI 78) > (expr_list:REG_DEAD (reg/v:SI 74 [ Sum ]) > (nil)))) > > > 2. RTL after the doloop optimization pass(arraysum.c.195r.loop2_doloop): > (note 34 0 32 2 [bb 2] NOTE_INSN_BASIC_BLOCK) > (note 32 34 36 2 NOTE_INSN_FUNCTION_BEG) > (insn 36 32 37 2 (set (reg:SI 72 [ ivtmp$8 ]) > (mem/u/c:SI (symbol_ref/u:SI ("*.LC2") [flags 0x2]) [2 S4 A32])) 29 {movsi_internal} > (expr_list:REG_EQUAL (symbol_ref:SI ("g") <var_decl 0x7f6eef5d62d0 g>) > (nil))) > (insn 37 36 33 2 (set (reg/f:SI 76 [ D.1393 ]) > (mem/u/c:SI (symbol_ref/u:SI ("*.LC3") [flags 0x2]) [2 S4 A32])) 29 {movsi_internal} > (expr_list:REG_EQUAL (const:SI (plus:SI (symbol_ref:SI ("g") <var_decl 0x7f6eef5d62d0 g>) > (const_int 4096 [0x1000]))) > (nil))) > (insn 33 37 54 2 (set (reg/v:SI 74 [ Sum ]) > (const_int 0 [0])) arraysum.c:6 29 {movsi_internal} > (nil)) > (insn 54 33 42 2 (set (reg:SI 79) > (const_int 1024 [0x400])) arraysum.c:6 -1 > (nil)) > (code_label 42 54 38 3 2 "" [0 uses]) > (note 38 42 39 3 [bb 3] NOTE_INSN_BASIC_BLOCK) > (insn 39 38 40 3 (set (reg:SI 77 [ MEM[base: _14, offset: 0B] ]) > (mem:SI (reg:SI 72 [ ivtmp$8 ]) [2 MEM[base: _14, offset: 0B]+0 S4 A32])) arraysum.c:9 29 {movsi_internal} > (nil)) > (insn 40 39 41 3 (set (reg/v:SI 74 [ Sum ]) > (plus:SI (reg/v:SI 74 [ Sum ]) > (reg:SI 77 [ MEM[base: _14, offset: 0B] ]))) arraysum.c:9 1 {addsi3} > (expr_list:REG_DEAD (reg:SI 77 [ MEM[base: _14, offset: 0B] ]) > (nil))) > (insn 41 40 53 3 (set (reg:SI 72 [ ivtmp$8 ]) > (plus:SI (reg:SI 72 [ ivtmp$8 ]) > (const_int 4 [0x4]))) 1 {addsi3} > (nil)) > (jump_insn 53 41 52 3 (parallel [ > (set (pc) > (if_then_else (ne (reg:SI 79) > (const_int 1 [0x1])) > (label_ref 52) > (pc))) > (set (reg:SI 79) > (plus:SI (reg:SI 79) > (const_int -1 [0xffffffffffffffff]))) > (unspec [ > (const_int 0 [0]) > ] 13) > (clobber (scratch:SI)) > ]) -1 > (int_list:REG_BR_PROB 9899 (nil)) > -> 52) > 
(code_label 52 53 51 5 3 "" [1 uses]) > (note 51 52 44 5 [bb 5] NOTE_INSN_BASIC_BLOCK) > (note 44 51 45 4 [bb 4] NOTE_INSN_BASIC_BLOCK) > (insn 45 44 46 4 (set (reg/f:SI 78) > (mem/u/c:SI (symbol_ref/u:SI ("*.LC4") [flags 0x2]) [2 S4 A32])) arraysum.c:11 29 {movsi_internal} > (expr_list:REG_EQUAL (symbol_ref:SI ("g_sum") <var_decl 0x7f6eef5d6360 g_sum>) > (nil))) > (insn 46 45 0 4 (set (mem/c:SI (reg/f:SI 78) [2 g_sum+0 S4 A32]) > (reg/v:SI 74 [ Sum ])) arraysum.c:11 29 {movsi_internal} > (expr_list:REG_DEAD (reg/f:SI 78) > (expr_list:REG_DEAD (reg/v:SI 74 [ Sum ]) > (nil)))) > > >> >> On Tue, Oct 14, 2014 at 8:39 AM, Felix Yang <fei.yang0953@gmail.com> wrote: >> > PING? >> > Cheers, >> > Felix >> >> Felix, >> >> This isn't my day job, 24-hour pings are unproductive. >> >> You shouldn't need to worry about the trip count register getting spilled. It >> makes no difference whatsoever to how the loop operates--the trip count is >> dead with regards to the loop once the instruction executes. You don't need to >> describe LCOUNT to gcc in order for this not to matter. It should be enough to >> describe the zcl as consuming the value in the same way a branch instruction >> consumes a value. >> >> If you have a case where spilling it is causing a problem, then there is a bug in >> your code, papered over by dropping case when it is spilled. Similarly with >> iter_reg_used_outside--it shouldn't affect whether or not a zcl is valid here. If >> you have a case where it does, there is likely a bug in your code. >> >> If the code is easier to write by maintaining trip_count up, then fine (for now); >> you give up some performance (in fact, a lot of performance), but that doesn't >> matter as to the correctness. >> >> >> > >> > >> > On Tue, Oct 14, 2014 at 12:30 AM, Felix Yang <fei.yang0953@gmail.com> >> wrote: >> >> Thanks for the comments. 
>> >> >> >> The patch checked the usage of teh trip count register, making sure >> >> that it is not used in the loop body other than the doloop_end or >> >> lives past the doloop_end instruction, as the following code snippet >> >> shows: >> >> >> >> + /* Scan all the blocks to make sure they don't use iter_reg. */ >> >> + if (loop->iter_reg_used || loop->iter_reg_used_outside) >> >> + { >> >> + if (dump_file) >> >> + fprintf (dump_file, ";; loop %d uses iterator\n", >> >> + loop->loop_no); >> >> + return false; >> >> + } >> >> >> >> For the spill issue, I think we need to handle it. The reason is >> >> that currently we are not telling GCC about the existence of the >> >> LCOUNT register. Instead, we keep the trip count in a general >> >> register and it's possible that this register can be spilled when >> >> register pressure is high. >> >> It's a good idea to post another patch to describe the LCOUNT >> >> register in GCC in order to free this general register. But I want >> >> this patch applied as a first step, OK? >> >> >> >> Cheers, >> >> Felix >> >> >> >> >> >> On Tue, Oct 14, 2014 at 12:09 AM, augustine.sterling@gmail.com >> >> <augustine.sterling@gmail.com> wrote: >> >>> On Fri, Oct 10, 2014 at 6:59 AM, Felix Yang <fei.yang0953@gmail.com> >> wrote: >> >>>> Hi Sterling, >> >>>> >> >>>> I made some improvement to the patch. Two changes: >> >>>> 1. TARGET_LOOPS is now used as a condition of the doloop >> >>>> related patterns, which is more elegant. >> >>> >> >>> Fine. >> >>> >> >>>> 2. As the trip count register of the zero-cost loop maybe >> >>>> potentially spilled, we need to change the patterns in order to >> >>>> handle this issue. >> >>> >> >>> Actually, for xtensa you don't. The trip count is copied into LCOUNT >> >>> at the execution of the loop instruction, and therefore a spill or >> >>> whatever doesn't matter--it won't affect the result. So as long as >> >>> you have the trip count at the start of the loop, you are fine. 
>> >>> >> >>> This does bring up an issue of whether or not the trip count can be >> >>> modified during the loop. (note that this is different than early >> >>> exit.) If it can, you can't use a zero-overhead loop. Does your >> >>> patch address this case. >> >>> >> >>> The solution is similar to that adapted by c6x backend. >> >>>> Just turn the zero-cost loop into a regular loop when that happens >> >>>> when reload is completed. >> >>>> Attached please find version 4 of the patch. Make check >> >>>> regression tested with xtensa-elf-gcc/simulator. >> >>>> OK for trunk?
Hi Sterling, Attached please find the testcase for the spill issue. Try it out with the patch :-) > > On Wed, Oct 15, 2014 at 7:10 PM, Yangfei (Felix) <felix.yang@huawei.com> > wrote: > > Hi Sterling, > > > > Since the patch is delayed for a long time, I'm kind of pushing it. Sorry for > that. > > Yeah, you are right. We have some performance issue here as GCC may > use one more general register in some cases with this patch. > > Take the following arraysum testcase for example. In doloop optimization, > GCC figures out that the number of iterations is 1024 and creates a new pseudo > 79 as the new trip count register. > > The pseudo 79 is live throughout the loop, this makes the register > pressure in the loop higher. And it's possible that this new pseudo is spilled by > reload when the register pressure is very high. > > I know that the xtensa loop instruction copies the trip count register into > the LCOUNT special register. And we need describe this hardware feature in GCC > in order to free the trip count register. > > But I find it difficult to do. Do you have any good suggestions on this? > > There are two issues related to the trip count, one I would like you to solve now, > one later. > > 1. Later: The trip count doesn't need to be updated at all inside these loops, once > the loop instruction executes. The code below relates to this case. > > 2. Now: You should be able to use a loop instruction regardless of whether the > trip count is spilled. If you have an example where that wouldn't work, I would > love to see it. 
> void foo (unsigned f, long v, unsigned *w, unsigned a, unsigned b, unsigned e, unsigned c, unsigned d) { unsigned h = v / 4, x[16]; while (f < h) { unsigned i; f++; a |= (a >> 30); d = (d << 30) | ((unsigned) d >> 30); c = (c << 30) | ((unsigned) c >> 30); b = 30 | ((unsigned) b >> 30); d += a = (a << 30) | ((unsigned) a >> 2); c += ((d << 5) | (d >> 27)) + ((e & (a ^ b))) + 0x5a827999 + x[12]; a += (c & e); c = 30 | ((unsigned) c); i = x[5] ^ x[7] ^ x[8] ^ x[3]; x[5] = (i << 1) | ((unsigned) i >> 31); i = x[6] ^ x[2] ^ x[14] ^ x[13]; x[6] = (i << 1) | (i >> 31); b += (c | (c >> 5)) + (d ^ e) + 0x6ed9eba1 + (x[7] = (i << 1) | ((unsigned) i >> 31)); x[8] = i | 1; e += (a | 5) + b + (i = x[9] ^ x[6], x[10] = (i << (unsigned) i)); e = 30 | ((unsigned) e >> 30); i = x[12] ^ x[14] ^ x[12] ^ x[12], (x[12] = 1 | ((unsigned) i)); i = x[13] ^ x[5] ^ x[10], (x[13] = (i << (unsigned) 1)); i = x[2] ^ x[7] ^ x[12], (x[15] = i | ((unsigned) i >> 1)); i = x[2] ^ x[0] ^ x[13], (x[0] = (i << 1) | 31); e = (e << 30) | 2; i = x[14] ^ x[2] ^ x[15], (x[2] = i | 1); x[3] = i | ((unsigned) i); i = x[14] ^ x[12] ^ x[4], (x[4] = 1 | ((unsigned) i >> 1)); x[5] = i | 1; e = (e << 30) | 30; b += (5 | ((unsigned) e >> 5)) + 0x8f1bbcdc + (x[9] = (i | ((unsigned) i >> 1))); i = x[2] ^ (x [10] = ((i << 1) | (i >> 1))); x[13] = (i | ((unsigned) i >> 1)); (i = x[14] ^ x[0] ^ x[14], (x[14] = ((i << 1) | 31))); a = *w += a; } }
If the tripcount spill issue is not handled in the pattern, ICE may happen then. Here reload is trying to spill pseudo 173, but a memory operand is not allowed in zero_cost_loop_end pattern. And this is what I am trying to solve. pr44023.c:48:1: error: unable to generate reloads for: } ^ (jump_insn 136 113 116 4 (parallel [ (set (pc) (if_then_else (ne (reg:SI 173) (const_int 1 [0x1])) (label_ref:SI 114) (pc))) (set (reg:SI 173) (plus:SI (reg:SI 173) (const_int -1 [0xffffffffffffffff]))) (unspec [ (const_int 0 [0]) ] 13) ]) pr44023.c:46 48 {zero_cost_loop_end} (int_list:REG_BR_PROB 9100 (nil)) -> 114) pr44023.c:48:1: internal compiler error: in find_reloads, at reload.c:3833 0x989383 _fatal_insn(char const*, rtx_def const*, char const*, int, char const*) ../../trunk/gcc/rtl-error.c:110 0x9777cb find_reloads(rtx_insn*, int, int, int, short*) ../../trunk/gcc/reload.c:3833 0x9875f1 calculate_needs_all_insns ../../trunk/gcc/reload1.c:1515 0x9875f1 reload(rtx_insn*, int) ../../trunk/gcc/reload1.c:1003 0x86a8a7 do_reload ../../trunk/gcc/ira.c:5323 0x86acd8 execute ../../trunk/gcc/ira.c:5470 > > Hi Sterling, > > Attached please find the testcase for the spill issue. Try it out with the > patch :-) > > > > > > On Wed, Oct 15, 2014 at 7:10 PM, Yangfei (Felix) > > <felix.yang@huawei.com> > > wrote: > > > Hi Sterling, > > > > > > Since the patch is delayed for a long time, I'm kind of pushing > > > it. Sorry for > > that. > > > Yeah, you are right. We have some performance issue here as GCC > > > may > > use one more general register in some cases with this patch. > > > Take the following arraysum testcase for example. In doloop > > > optimization, > > GCC figures out that the number of iterations is 1024 and creates a > > new pseudo > > 79 as the new trip count register. > > > The pseudo 79 is live throughout the loop, this makes the > > > register > > pressure in the loop higher. 
And it's possible that this new pseudo is > > spilled by reload when the register pressure is very high. > > > I know that the xtensa loop instruction copies the trip count > > > register into > > the LCOUNT special register. And we need describe this hardware > > feature in GCC in order to free the trip count register. > > > But I find it difficult to do. Do you have any good suggestions on this? > > > > There are two issues related to the trip count, one I would like you > > to solve now, one later. > > > > 1. Later: The trip count doesn't need to be updated at all inside > > these loops, once the loop instruction executes. The code below relates to this > case. > > > > 2. Now: You should be able to use a loop instruction regardless of > > whether the trip count is spilled. If you have an example where that > > wouldn't work, I would love to see it. > >
On Tue, Oct 21, 2014 at 7:20 PM, Yangfei (Felix) <felix.yang@huawei.com> wrote: > If the tripcount spill issue is not handled in the pattern, ICE may happen then. > Here reload is trying to spill pseudo 173, but a memory operand is not allowed in zero_cost_loop_end pattern. > And this is what I am trying to solve. We have full control of the zero_cost_loop_end pattern. Plus, it doesn't actually generate any real code. Edit it so it can take a memory operand.
> On Tue, Oct 21, 2014 at 7:20 PM, Yangfei (Felix) <felix.yang@huawei.com> wrote: > > If the tripcount spill issue is not handled in the pattern, ICE may happen then. > > Here reload is trying to spill pseudo 173, but a memory operand is not allowed > in zero_cost_loop_end pattern. > > And this is what I am trying to solve. > > We have full control of the zero_cost_loop_end pattern. Plus, it doesn't actually > generate any real code. Edit it so it can take a memory operand. Here the key point is we need a general purpose register for the "loop" instruction. If the trip count register is spilled, we don't have a general purpose register then. And we cannot use zero-cost looping in this situation. And that's why I split the zero_cost_loop_end into a normal test and branch.
> > On Tue, Oct 21, 2014 at 7:20 PM, Yangfei (Felix) <felix.yang@huawei.com> > wrote: > > > If the tripcount spill issue is not handled in the pattern, ICE may happen then. > > > Here reload is trying to spill pseudo 173, but a memory operand is > > > not allowed > > in zero_cost_loop_end pattern. > > > And this is what I am trying to solve. > > > > We have full control of the zero_cost_loop_end pattern. Plus, it > > doesn't actually generate any real code. Edit it so it can take a memory > operand. > > Here the key point is we need a general purpose register for the "loop" > instruction. > If the trip count register is spilled, we don't have a general purpose register then. > And we cannot use zero-cost looping in this situation. > And that's why I spilt the zero_cost_loop_end into a normal test and branch. Also note that the hwloop_pattern_reg interface also expects a general purpose register in the doloop_end pattern.
On Thu, Oct 23, 2014 at 9:12 PM, Yangfei (Felix) <felix.yang@huawei.com> wrote: >> Here the key point is we need a general purpose register for the "loop" >> instruction. So the question to ask here is, "How does this work today, without loop instructions?" Somehow--even when it has been spilled--a branch instruction can test the trip count. There should be no difference. >> And we cannot use zero-cost looping in this situation. >> And that's why I spilt the zero_cost_loop_end into a normal test and branch. > Also note that the hwloop_pattern_reg interface also expects a general purpose register in the doloop_end pattern. If there were no loop instruction, how would this work?
> > On Thu, Oct 23, 2014 at 9:12 PM, Yangfei (Felix) <felix.yang@huawei.com> wrote: > >> Here the key point is we need a general purpose register for the "loop" > >> instruction. > > So the question to ask here is, "How does this work today, without loop > instructions?" Somehow--even when it has been spilled--a branch instruction can > test the trip count. There should be no difference. > > >> And we cannot use zero-cost looping in this situation. > >> And that's why I spilt the zero_cost_loop_end into a normal test and branch. > > > Also note that the hwloop_pattern_reg interface also expects a general > purpose register in the doloop_end pattern. > > If there were no loop instruction, how would this work? Just take a look at my patch. I handle this in the new define_split: +(define_split + [(set (pc) + (if_then_else (ne (match_operand:SI 0 "nonimmediate_operand" "") + (const_int 1)) + (label_ref (match_operand 1 "" "")) + (pc))) + (set (match_operand:SI 2 "nonimmediate_operand" "") + (plus:SI (match_dup 0) + (const_int -1))) + (unspec [(const_int 0)] UNSPEC_LSETUP_END) + (clobber (match_scratch 3))] + "TARGET_LOOPS && optimize && reload_completed" + [(const_int 0)] +{ + if (!REG_P (operands[0])) + { + rtx test; + + /* Fallback into a normal conditional branch insn. */ + emit_move_insn (operands[3], operands[0]); + emit_insn (gen_addsi3 (operands[3], operands[3], constm1_rtx)); + emit_move_insn (operands[0], operands[3]); + test = gen_rtx_NE (VOIDmode, operands[3], const0_rtx); + emit_jump_insn (gen_cbranchsi4 (test, operands[3], + const0_rtx, operands[1])); + } + else + { + emit_jump_insn (gen_loop_end (operands[0], operands[1], operands[2])); + } + + DONE; +})
I mean without your patch at all. On Thu, Oct 23, 2014 at 11:30 PM, Yangfei (Felix) <felix.yang@huawei.com> wrote: >> >> On Thu, Oct 23, 2014 at 9:12 PM, Yangfei (Felix) <felix.yang@huawei.com> wrote: >> >> Here the key point is we need a general purpose register for the "loop" >> >> instruction. >> >> So the question to ask here is, "How does this work today, without loop >> instructions?" Somehow--even when it has been spilled--a branch instruction can >> test the trip count. There should be no difference. >> >> >> And we cannot use zero-cost looping in this situation. >> >> And that's why I spilt the zero_cost_loop_end into a normal test and branch. >> >> > Also note that the hwloop_pattern_reg interface also expects a general >> purpose register in the doloop_end pattern. >> >> If there were no loop instruction, how would this work? > > > Just take a look at my patch. I handle this in the new define_split: > > +(define_split > + [(set (pc) > + (if_then_else (ne (match_operand:SI 0 "nonimmediate_operand" "") > + (const_int 1)) > + (label_ref (match_operand 1 "" "")) > + (pc))) > + (set (match_operand:SI 2 "nonimmediate_operand" "") > + (plus:SI (match_dup 0) > + (const_int -1))) > + (unspec [(const_int 0)] UNSPEC_LSETUP_END) > + (clobber (match_scratch 3))] > + "TARGET_LOOPS && optimize && reload_completed" > + [(const_int 0)] > +{ > + if (!REG_P (operands[0])) > + { > + rtx test; > + > + /* Fallback into a normal conditional branch insn. */ > + emit_move_insn (operands[3], operands[0]); > + emit_insn (gen_addsi3 (operands[3], operands[3], constm1_rtx)); > + emit_move_insn (operands[0], operands[3]); > + test = gen_rtx_NE (VOIDmode, operands[3], const0_rtx); > + emit_jump_insn (gen_cbranchsi4 (test, operands[3], > + const0_rtx, operands[1])); > + } > + else > + { > + emit_jump_insn (gen_loop_end (operands[0], operands[1], operands[2])); > + } > + > + DONE; > +})
1. The original xtensa port never generates "loop" instruction at all. 2. A port doesn't need to implement hwloop_pattern_reg hook if it has no zero-cost loop instruction. Is that clear? > > I mean without your patch at all. > > On Thu, Oct 23, 2014 at 11:30 PM, Yangfei (Felix) <felix.yang@huawei.com> > wrote: > >> > >> On Thu, Oct 23, 2014 at 9:12 PM, Yangfei (Felix) <felix.yang@huawei.com> > wrote: > >> >> Here the key point is we need a general purpose register for the "loop" > >> >> instruction. > >> > >> So the question to ask here is, "How does this work today, without > >> loop instructions?" Somehow--even when it has been spilled--a branch > >> instruction can test the trip count. There should be no difference. > >> > >> >> And we cannot use zero-cost looping in this situation. > >> >> And that's why I spilt the zero_cost_loop_end into a normal test and > branch. > >> > >> > Also note that the hwloop_pattern_reg interface also expects a > >> > general > >> purpose register in the doloop_end pattern. > >> > >> If there were no loop instruction, how would this work? > > > > > > Just take a look at my patch. I handle this in the new define_split: > > > > +(define_split > > + [(set (pc) > > + (if_then_else (ne (match_operand:SI 0 "nonimmediate_operand" > "") > > + (const_int 1)) > > + (label_ref (match_operand 1 "" "")) > > + (pc))) > > + (set (match_operand:SI 2 "nonimmediate_operand" "") > > + (plus:SI (match_dup 0) > > + (const_int -1))) > > + (unspec [(const_int 0)] UNSPEC_LSETUP_END) > > + (clobber (match_scratch 3))] > > + "TARGET_LOOPS && optimize && reload_completed" > > + [(const_int 0)] > > +{ > > + if (!REG_P (operands[0])) > > + { > > + rtx test; > > + > > + /* Fallback into a normal conditional branch insn. 
*/ > > + emit_move_insn (operands[3], operands[0]); > > + emit_insn (gen_addsi3 (operands[3], operands[3], constm1_rtx)); > > + emit_move_insn (operands[0], operands[3]); > > + test = gen_rtx_NE (VOIDmode, operands[3], const0_rtx); > > + emit_jump_insn (gen_cbranchsi4 (test, operands[3], > > + const0_rtx, operands[1])); > > + } > > + else > > + { > > + emit_jump_insn (gen_loop_end (operands[0], operands[1], > operands[2])); > > + } > > + > > + DONE; > > +})
On Thu, Oct 23, 2014 at 11:40 PM, Yangfei (Felix) <felix.yang@huawei.com> wrote: > 1. The original xtensa port never generates "loop" instruction at all. > 2. A port doesn't need to implement hwloop_pattern_reg hook if it has no zero-cost loop instruction. > > Is that clear? We are talking in circles. I understand very well what goes on here. My point is: 1. Right now, today, GCC generates loops with branch instructions even when the trip count is spilled. 2. Branch instructions and loop instructions have identical register requirements. Therefore: 3. loop instructions should be generatable when the trip count is spilled.
Thanks for the explanation. I think I am clear about what you are thinking now. That's an interesting question. I am not sure about the reason why GCC's reload cannot handle a doloop_end insn. I guess maybe the doloop_end pattern is special? I mean it's a branch insn in a parallel form. > > On Thu, Oct 23, 2014 at 11:40 PM, Yangfei (Felix) <felix.yang@huawei.com> > wrote: > > 1. The original xtensa port never generates "loop" instruction at all. > > 2. A port doesn't need to implement hwloop_pattern_reg hook if it has no > zero-cost loop instruction. > > > > Is that clear? > > We are talking in circles. I understand very well what goes on here. > > My point is: > > 1. Right now, today, GCC generates loops with branch instructions even when the > trip count is spilled. > 2. Branch instructions and loop instructions have identical register requirements. > > Therefore: > > 3. loop instructions should be generatable when the trip count is spilled.
On Thu, Oct 23, 2014 at 11:51 PM, Yangfei (Felix) <felix.yang@huawei.com> wrote: > Thanks for the explanation. I think I am clear about what you are thinking now. > That's an interesting question. I am not sure about reason why GCC's reload cannot handle a doloop_end insn. > I guess maybe the doloop_end pattern is special? I mean it's a branch insn in a parallel form. No, it is not special. It is just that jumps are never handled by reload. I thought this was documented somewhere also. Basically the main issue with jumps is: where does the reload value go, i.e. on which side of the jump? Thanks, Andrew > > > >> >> On Thu, Oct 23, 2014 at 11:40 PM, Yangfei (Felix) <felix.yang@huawei.com> >> wrote: >> > 1. The original xtensa port never generates "loop" instruction at all. >> > 2. A port doesn't need to implement hwloop_pattern_reg hook if it has no >> zero-cost loop instruction. >> > >> > Is that clear? >> >> We are talking in circles. I understand very well what goes on here. >> >> My point is: >> >> 1. Right now, today, GCC generates loops with branch instructions even when the >> trip count is spilled. >> 2. Branch instructions and loop instructions have identical register requirements. >> >> Therefore: >> >> 3. loop instructions should be generatable when the trip count is spilled.
> On Thu, Oct 23, 2014 at 11:51 PM, Yangfei (Felix) <felix.yang@huawei.com> > wrote: > > Thanks for the explanation. I think I am clear about what you are thinking now. > > That's an interesting question. I am not sure about reason why GCC's reload > cannot handle a doloop_end insn. > > I guess maybe the doloop_end pattern is special? I mean it's a branch insn in a > parallel form. > > > No it is not special. Just jump are never handled by reload. I thought this was > documented somewhere also. Basically the main issue with jumps is where > does the reload value go which side of the jump? > > Thanks, > Andrew > Hi Sterling, What do you think about this issue? As the c6x/bfin ports handle this the same way, is it OK for the patch to be applied? Thanks.
Index: gcc/ChangeLog =================================================================== --- gcc/ChangeLog (revision 216079) +++ gcc/ChangeLog (working copy) @@ -1,3 +1,20 @@ +2014-10-10 Felix Yang <felix.yang@huawei.com> + + * config/xtensa/xtensa.h (TARGET_LOOPS): New Macro. + * config/xtensa/xtensa.c (xtensa_reorg): New. + (xtensa_reorg_loops): New. + (xtensa_can_use_doloop_p): New. + (xtensa_invalid_within_doloop): New. + (hwloop_optimize): New. + (hwloop_fail): New. + (hwloop_pattern_reg): New. + (xtensa_emit_loop_end): Modified to emit the zero-overhead loop end label. + (xtensa_doloop_hooks): Define. + * config/xtensa/xtensa.md (doloop_end): New. + (loop_end): New + (zero_cost_loop_start): Rewritten. + (zero_cost_loop_end): Rewritten. + 2014-10-10 Kyrylo Tkachov <kyrylo.tkachov@arm.com> * configure.ac: Add --enable-fix-cortex-a53-835769 option. Index: gcc/config/xtensa/xtensa.md =================================================================== --- gcc/config/xtensa/xtensa.md (revision 216079) +++ gcc/config/xtensa/xtensa.md (working copy) @@ -35,6 +35,8 @@ (UNSPEC_TLS_CALL 9) (UNSPEC_TP 10) (UNSPEC_MEMW 11) + (UNSPEC_LSETUP_START 12) + (UNSPEC_LSETUP_END 13) (UNSPECV_SET_FP 1) (UNSPECV_ENTRY 2) @@ -1289,41 +1291,120 @@ (set_attr "length" "3")]) +;; Zero-overhead looping support. + ;; Define the loop insns used by bct optimization to represent the -;; start and end of a zero-overhead loop (in loop.c). This start -;; template generates the loop insn; the end template doesn't generate -;; any instructions since loop end is handled in hardware. +;; start and end of a zero-overhead loop. This start template generates +;; the loop insn; the end template doesn't generate any instructions since +;; loop end is handled in hardware. 
(define_insn "zero_cost_loop_start" [(set (pc) - (if_then_else (eq (match_operand:SI 0 "register_operand" "a") - (const_int 0)) - (label_ref (match_operand 1 "" "")) - (pc))) - (set (reg:SI 19) - (plus:SI (match_dup 0) (const_int -1)))] - "" - "loopnez\t%0, %l1" + (if_then_else (ne (match_operand:SI 0 "register_operand" "2") + (const_int 1)) + (label_ref (match_operand 1 "" "")) + (pc))) + (set (match_operand:SI 2 "register_operand" "=a") + (plus (match_dup 0) + (const_int -1))) + (unspec [(const_int 0)] UNSPEC_LSETUP_START)] + "TARGET_LOOPS && optimize" + "loop\t%0, %l1_LEND" [(set_attr "type" "jump") (set_attr "mode" "none") (set_attr "length" "3")]) (define_insn "zero_cost_loop_end" [(set (pc) - (if_then_else (ne (reg:SI 19) (const_int 0)) - (label_ref (match_operand 0 "" "")) - (pc))) - (set (reg:SI 19) - (plus:SI (reg:SI 19) (const_int -1)))] - "" + (if_then_else (ne (match_operand:SI 0 "nonimmediate_operand" "2,2") + (const_int 1)) + (label_ref (match_operand 1 "" "")) + (pc))) + (set (match_operand:SI 2 "nonimmediate_operand" "=a,m") + (plus (match_dup 0) + (const_int -1))) + (unspec [(const_int 0)] UNSPEC_LSETUP_END) + (clobber (match_scratch:SI 3 "=X,&r"))] + "TARGET_LOOPS && optimize" + "#" + [(set_attr "type" "jump") + (set_attr "mode" "none") + (set_attr "length" "0")]) + +(define_insn "loop_end" + [(set (pc) + (if_then_else (ne (match_operand:SI 0 "register_operand" "2") + (const_int 1)) + (label_ref (match_operand 1 "" "")) + (pc))) + (set (match_operand:SI 2 "register_operand" "=a") + (plus (match_dup 0) + (const_int -1))) + (unspec [(const_int 0)] UNSPEC_LSETUP_END)] + "TARGET_LOOPS && optimize" { - xtensa_emit_loop_end (insn, operands); - return ""; + xtensa_emit_loop_end (insn, operands); + return ""; } [(set_attr "type" "jump") (set_attr "mode" "none") (set_attr "length" "0")]) +(define_split + [(set (pc) + (if_then_else (ne (match_operand:SI 0 "nonimmediate_operand" "") + (const_int 1)) + (label_ref (match_operand 1 "" "")) + (pc))) + (set 
(match_operand:SI 2 "nonimmediate_operand" "") + (plus:SI (match_dup 0) + (const_int -1))) + (unspec [(const_int 0)] UNSPEC_LSETUP_END) + (clobber (match_scratch 3))] + "TARGET_LOOPS && optimize && reload_completed" + [(const_int 0)] +{ + if (!REG_P (operands[0])) + { + rtx test; + + /* Fallback into a normal conditional branch insn. */ + emit_move_insn (operands[3], operands[0]); + emit_insn (gen_addsi3 (operands[3], operands[3], constm1_rtx)); + emit_move_insn (operands[0], operands[3]); + test = gen_rtx_NE (VOIDmode, operands[3], const0_rtx); + emit_jump_insn (gen_cbranchsi4 (test, operands[3], + const0_rtx, operands[1])); + } + else + { + emit_jump_insn (gen_loop_end (operands[0], operands[1], operands[2])); + } + + DONE; +}) + +; operand 0 is the loop count pseudo register +; operand 1 is the label to jump to at the top of the loop +(define_expand "doloop_end" + [(parallel [(set (pc) (if_then_else + (ne (match_operand:SI 0 "" "") + (const_int 1)) + (label_ref (match_operand 1 "" "")) + (pc))) + (set (match_dup 0) + (plus:SI (match_dup 0) + (const_int -1))) + (unspec [(const_int 0)] UNSPEC_LSETUP_END) + (clobber (match_dup 2))])] ; match_scratch + "TARGET_LOOPS && optimize" +{ + /* The loop optimizer doesn't check the predicates... */ + if (GET_MODE (operands[0]) != SImode) + FAIL; + operands[2] = gen_rtx_SCRATCH (SImode); +}) + ;; Setting a register from a comparison. Index: gcc/config/xtensa/xtensa.c =================================================================== --- gcc/config/xtensa/xtensa.c (revision 216079) +++ gcc/config/xtensa/xtensa.c (working copy) @@ -61,6 +61,8 @@ along with GCC; see the file COPYING3. 
If not see #include "gimplify.h" #include "df.h" #include "builtins.h" +#include "dumpfile.h" +#include "hw-doloop.h" /* Enumeration for all of the relational tests, so that we can build @@ -186,6 +188,10 @@ static reg_class_t xtensa_secondary_reload (bool, static bool constantpool_address_p (const_rtx addr); static bool xtensa_legitimate_constant_p (enum machine_mode, rtx); +static void xtensa_reorg (void); +static bool xtensa_can_use_doloop_p (const widest_int &, const widest_int &, + unsigned int, bool); +static const char *xtensa_invalid_within_doloop (const rtx_insn *); static bool xtensa_member_type_forces_blk (const_tree, enum machine_mode mode); @@ -312,6 +318,15 @@ static const int reg_nonleaf_alloc_order[FIRST_PSE #undef TARGET_LEGITIMATE_CONSTANT_P #define TARGET_LEGITIMATE_CONSTANT_P xtensa_legitimate_constant_p +#undef TARGET_MACHINE_DEPENDENT_REORG +#define TARGET_MACHINE_DEPENDENT_REORG xtensa_reorg + +#undef TARGET_CAN_USE_DOLOOP_P +#define TARGET_CAN_USE_DOLOOP_P xtensa_can_use_doloop_p + +#undef TARGET_INVALID_WITHIN_DOLOOP +#define TARGET_INVALID_WITHIN_DOLOOP xtensa_invalid_within_doloop + struct gcc_target targetm = TARGET_INITIALIZER; @@ -1676,7 +1691,7 @@ xtensa_emit_loop_end (rtx_insn *insn, rtx *operand } } - output_asm_insn ("# loop end for %0", operands); + output_asm_insn ("%1_LEND:", operands); } @@ -3712,4 +3727,236 @@ xtensa_legitimate_constant_p (enum machine_mode mo return !xtensa_tls_referenced_p (x); } +/* Implement TARGET_CAN_USE_DOLOOP_P. */ + +static bool +xtensa_can_use_doloop_p (const widest_int &, const widest_int &, + unsigned int loop_depth, bool entered_at_top) +{ + /* Considering limitations in the hardware, only use doloop + for innermost loops which must be entered from the top. */ + if (loop_depth > 1 || !entered_at_top) + return false; + + return true; +} + +/* NULL if INSN insn is valid within a low-overhead loop. + Otherwise return why doloop cannot be applied. 
*/ + +static const char * +xtensa_invalid_within_doloop (const rtx_insn *insn) +{ + if (CALL_P (insn)) + return "Function call in the loop."; + + if (JUMP_P (insn) && INSN_CODE (insn) == CODE_FOR_return) + return "Return from a call instruction in the loop."; + + return NULL; +} + +/* Optimize LOOP. */ + +static bool +hwloop_optimize (hwloop_info loop) +{ + int i; + edge entry_edge; + basic_block entry_bb; + rtx iter_reg; + rtx_insn *insn, *seq, *entry_after; + + if (loop->depth > 1) + { + if (dump_file) + fprintf (dump_file, ";; loop %d is not innermost\n", + loop->loop_no); + return false; + } + + if (!loop->incoming_dest) + { + if (dump_file) + fprintf (dump_file, ";; loop %d has more than one entry\n", + loop->loop_no); + return false; + } + + if (loop->incoming_dest != loop->head) + { + if (dump_file) + fprintf (dump_file, ";; loop %d is not entered from head\n", + loop->loop_no); + return false; + } + + if (loop->has_call || loop->has_asm) + { + if (dump_file) + fprintf (dump_file, ";; loop %d has invalid insn\n", + loop->loop_no); + return false; + } + + /* Scan all the blocks to make sure they don't use iter_reg. */ + if (loop->iter_reg_used || loop->iter_reg_used_outside) + { + if (dump_file) + fprintf (dump_file, ";; loop %d uses iterator\n", + loop->loop_no); + return false; + } + + /* Check if start_label appears before doloop_end. */ + insn = loop->start_label; + while (insn && insn != loop->loop_end) + insn = NEXT_INSN (insn); + + if (!insn) + { + if (dump_file) + fprintf (dump_file, ";; loop %d start_label not before loop_end\n", + loop->loop_no); + return false; + } + + /* Get the loop iteration register. */ + iter_reg = loop->iter_reg; + + gcc_assert (REG_P (iter_reg)); + + entry_edge = NULL; + + FOR_EACH_VEC_SAFE_ELT (loop->incoming, i, entry_edge) + if (entry_edge->flags & EDGE_FALLTHRU) + break; + + if (entry_edge == NULL) + return false; + + /* Place the zero_cost_loop_start instruction before the loop. 
*/ + entry_bb = entry_edge->src; + + start_sequence (); + + insn = emit_insn (gen_zero_cost_loop_start (loop->iter_reg, + loop->start_label, + loop->iter_reg)); + + seq = get_insns (); + + if (!single_succ_p (entry_bb) || vec_safe_length (loop->incoming) > 1) + { + basic_block new_bb; + edge e; + edge_iterator ei; + + emit_insn_before (seq, BB_HEAD (loop->head)); + seq = emit_label_before (gen_label_rtx (), seq); + new_bb = create_basic_block (seq, insn, entry_bb); + FOR_EACH_EDGE (e, ei, loop->incoming) + { + if (!(e->flags & EDGE_FALLTHRU)) + redirect_edge_and_branch_force (e, new_bb); + else + redirect_edge_succ (e, new_bb); + } + + make_edge (new_bb, loop->head, 0); + } + else + { + entry_after = BB_END (entry_bb); + while (DEBUG_INSN_P (entry_after) + || (NOTE_P (entry_after) + && NOTE_KIND (entry_after) != NOTE_INSN_BASIC_BLOCK)) + entry_after = PREV_INSN (entry_after); + + emit_insn_after (seq, entry_after); + } + + end_sequence (); + + return true; +} + +/* A callback for the hw-doloop pass. Called when a loop we have discovered + turns out not to be optimizable; we have to split the loop_end pattern into + a subtract and a test. */ + +static void +hwloop_fail (hwloop_info loop) +{ + rtx test; + rtx_insn *insn = loop->loop_end; + + emit_insn_before (gen_addsi3 (loop->iter_reg, + loop->iter_reg, + constm1_rtx), + loop->loop_end); + + test = gen_rtx_NE (VOIDmode, loop->iter_reg, const0_rtx); + insn = emit_jump_insn_before (gen_cbranchsi4 (test, + loop->iter_reg, const0_rtx, + loop->start_label), + loop->loop_end); + + JUMP_LABEL (insn) = loop->start_label; + LABEL_NUSES (loop->start_label)++; + delete_insn (loop->loop_end); +} + +/* A callback for the hw-doloop pass. This function examines INSN; if + it is a doloop_end pattern we recognize, return the reg rtx for the + loop counter. Otherwise, return NULL_RTX. 
*/ + +static rtx +hwloop_pattern_reg (rtx_insn *insn) +{ + rtx reg; + + if (!JUMP_P (insn) || recog_memoized (insn) != CODE_FOR_loop_end) + return NULL_RTX; + + reg = SET_DEST (XVECEXP (PATTERN (insn), 0, 1)); + if (!REG_P (reg)) + return NULL_RTX; + + return reg; +} + + +static struct hw_doloop_hooks xtensa_doloop_hooks = +{ + hwloop_pattern_reg, + hwloop_optimize, + hwloop_fail +}; + +/* Run from machine_dependent_reorg, this pass looks for doloop_end insns + and tries to rewrite the RTL of these loops so that proper Xtensa + hardware loops are generated. */ + +static void +xtensa_reorg_loops (void) +{ + reorg_loops (false, &xtensa_doloop_hooks); +} + +/* Implement the TARGET_MACHINE_DEPENDENT_REORG pass. */ + +static void +xtensa_reorg (void) +{ + /* We are freeing block_for_insn in the toplev to keep compatibility + with old MDEP_REORGS that are not CFG based. Recompute it now. */ + compute_bb_for_insn (); + + df_analyze (); + + /* Doloop optimization. */ + xtensa_reorg_loops (); +} + #include "gt-xtensa.h" Index: gcc/config/xtensa/xtensa.h =================================================================== --- gcc/config/xtensa/xtensa.h (revision 216079) +++ gcc/config/xtensa/xtensa.h (working copy) @@ -61,6 +61,7 @@ extern unsigned xtensa_current_frame_size; #define TARGET_S32C1I XCHAL_HAVE_S32C1I #define TARGET_ABSOLUTE_LITERALS XSHAL_USE_ABSOLUTE_LITERALS #define TARGET_THREADPTR XCHAL_HAVE_THREADPTR +#define TARGET_LOOPS XCHAL_HAVE_LOOPS #define TARGET_DEFAULT \ ((XCHAL_HAVE_L32R ? 0 : MASK_CONST16) | \