Patchwork [5/9,SMS] Support new loop pattern

login
register
mail settings
Submitter zhroma@ispras.ru
Date Sept. 30, 2011, 3:22 p.m.
Message ID <CAJnFk2eyzXeRGGuY=re=n=_TT1bPvrWLgxNSGYzRaoPfLLOTaA@mail.gmail.com>
Download mbox | patch
Permalink /patch/117156/
State New
Headers show

Comments

zhroma@ispras.ru - Sept. 30, 2011, 3:22 p.m.
2011/7/21  <zhroma@ispras.ru>:
> This patch should be applied only after pending patches by Revital.


Ping. New version is attached, it suits current trunk without
additional patches.
Also this related patch needs approval:
http://gcc.gnu.org/ml/gcc-patches/2011-07/msg01804.html

> The loop should meet the following requirements.
> First three are the same as for loop with doloop pattern:
> ...
> The next three describe the control part of new supported loops.
> - the last jump instruction should look like:  pc=(regF!=0)?label:pc, regF is
>  flag register;
> - the last instruction which sets regF should be: regF=COMPARE(regC,X), where X
>  is a constant, or maybe a register, which is not changed inside a loop;
> - only one instruction modifies regC inside a loop (other can use regC, but not
>  write), and it should simply adjust it by a constant: regC=regC+step, where
>  step is a constant.

> When doloop is succesfully scheduled by SMS, its number of
> iterations of loop kernel should be decreased by the number of stages in a
> schedule minus one, while other iterations expand to prologue and epilogue.
> In new supported loops such approach can't be used, because some
> instructions can use count register (regC).  Instead of this,
> the final register value X in compare instruction regF=COMPARE(regC,X)
> is changed to another value Y respective to the stage this instruction
> is scheduled (Y = X - stage * step).

The main difference from doloop case is that regC can be used by some
instructions in loop body.
That's why we are unable to simply adjust regC initial value, but have
to keep it's value correct on each particular iteration.
So, we change comparison instruction accordingly.

An example:
int a[100];
int main()
{
  int i;
  for (i = 85; i > 12; i -= 5)
      a[i] = i * i;
  return a[15]-225;
}
ARM assembler with "-O2 -fno-auto-inc-dec":
        ldr     r0, .L5
        mov     r3, #85
        mov     r2, r0
.L2:
        mul     r1, r3, r3
        sub     r3, r3, #5
        cmp     r3, #10
        str     r1, [r2, #340]
        sub     r2, r2, #20
        bne     .L2
        ldr     r0, [r0, #60]
        sub     r0, r0, #225
        bx      lr
.L5:
        .word   a

Loop body is executed 15 times.
When compiling with SMS, it finds a schedule with ii=7, stage_count=3
and following times:
Stage  Time       Insn
0          5      mul     r1, r3, r3
1         10     sub     r3, r3, #5
1         11     cmp     r3, #10
1         11     str     r1, [r2, #340]
1         13     bne     .L2
2         16     sub     r2, r2, #20

To make new schedule correct the loop body
should be executed 14 times and we change compare instruction:
regF=COMPARE(regC,X) to regF=COMPARE(regC,Y) where Y = X - stage * step.
In our example regC is r3, X is 10, step = -5, compare instruction
is scheduled on stage 1, so it should be Y = 10 - 1 * (-5) = 15.

So, after SMS it looks like:
        ldr     r0, .L5
        mov     r3, #85
        mov     r2, r0
;;prologue
        mul     r1, r3, r3      ;;from stage 0 first iteration
        sub     r3, r3, #5      ;;3 insns from stage 1 first iteration
        cmp     r3, #10
        str     r1, [r2, #340]
        mul     r1, r3, r3      ;;from stage 0 second iteration
;;body
.L2:
        sub     r3, r3, #5
        sub     r2, r2, #20
        cmp     r3, #15         ;; new value to compare with is Y=15
        str     r1, [r2, #340]
        mul     r1, r3, r3
        bne     .L2
;;epilogue
        sub     r2, r2, #20     ;;from stage 2 pre-last iteration
        sub     r3, r3, #5      ;;3 insns from stage 1 last iteration
        cmp     r3, #10
        str     r1, [r2, #340]
        sub     r2, r2, #20     ;;from stage 2 last iteration

        ldr     r0, [r0, #60]
        sub     r0, r0, #225
        bx      lr
.L5:
        .word   a

Real ARM assembler with SMS (after some optimizations and without dead code):
        mov     r3, #85
        ldr     r0, .L8
        mul     r1, r3, r3
        sub     r3, r3, #5
        mov     r2, r0
        str     r1, [r0, #340]
        mul     r1, r3, r3
.L2:
        sub     r3, r3, #5
        sub     r2, r2, #20
        cmp     r3, #15
        str     r1, [r2, #340]
        mul     r1, r3, r3
        bne     .L2
        str     r1, [r2, #320]
        ldr     r0, [r0, #60]
        sub     r0, r0, #225
        bx      lr
.L8:
        .word   a

>
> Testing of this appoach reveals two bugs, which do not appear while SMS was
> used only for doloop loops.  Both these bugs happen due to the nature of the
> flag register.  On x86_64 it is clobbered by most of arithmetic instructions.
> The following situation happens when SMS is enabled without register renaming
> (-fno-modulo-sched-allow-regmoves).  When data dependency graph is built, there
> is a step when we generate anti-dependencies from last register use to first
> write of this register at the next iteration.  At this moment we should also
> create such dependencies to all instructions which clobber the register to
> prevent this clobbers being before last use is new schedule.
>
> Here is an model of example:
>
> loop {
> set1 regR
> use1 regR
> clobber regR
> set2 regR
> use2 regR
> }
>
> If we create only use2->set1 anti-dependency (and no use2->cloober) the
> following wrong schedule is possible:
>
> prologue {
> set1 regR
> use1 regR
> clobber regR
> }
> kernel {
> set2 regR
> clobber regR (instruction from next iteration in terms of old loop kernel)
> use2 regR
> set1 regR (also from next iteration)
> use1 regR (also from next iteration)
> }
> epilogue {
> set2 regR
> use2 regR
> }
>
> This problem was succesfully fixed by creating a vector of all clobbering
> instructions together with first write and adding all needed dependencies.
>
> The other bug happens only with -fmodulo-sched-allow-regmoves.  Here we
> eliminate some anti-dependence edges in data dependency graph in order to
> resolve them later by adding some register moves (renaming instructions).  But
> in situation as in example above compiler gives an ICE because it can't create
> a register move, when regR is hardware flag register.  So we have to know which
> register(s) cause anti-dependency in order to understand whether we can ignore
> it.  I can't find any easy way to gather this information, so I create my own
> structures to store this info and had implemented my own hooks for
> sched_analyze function.  This leads to more complex interconnection between
> ddg.c and modulo-sched.c.
>
> One more thing to point out is number of loop iterations. When number of
> iterations of a loop is not known at compile time, SMS has to create two loop
> versions (original and scheduled), and execute scheduled one only when real
> number of iterations is bigger than number of stages.  In doloop case the
> number of iterations simply equals to the count register value before the loop.
> So SMS finds its constant initialization or makes two loop versions.  In new
> supported loops number of iterations value is more complex.  It even can't be
> calculated as (final_reg_value-start_reg_value)/step because of examples like
> this:
>
> for (unsigned int x = 0x0; x != 0x6F80919A; x += 0xEDCBA987) ...;
>
> This loop has 22 iterations.  So, i decided to use get_simple_loop_desc
> function which gives a structure with loop characteristics, some of them helps
> to find iteration number:
>
> rtx niter_expr - The number of iterations of the loop;
> bool const_iter - True if the loop iterates the constant number of times;
> unsigned HOST_WIDEST_INT niter - Number of iterations if constant;
>
> But we can use these expressions only after looking through some other fields
> of returned structure:
>
> bool simple_p - True if we are able to say anything about number of iterations
> of the loop;
> rtx assumptions - Assumptions under that the rest of the information is valid;
> rtx noloop_assumptions - Assumptions under which the loop ends before reaching
> the latch;
> rtx infinite - Condition under which the loop is infinite.
>
> I decide to allow SMS scheduling only when simple_p is true and other three
> fields are NULL_RTX, or when simple_p is true and
> flag_unsafe_loop_optimizations is set.  One more exception is infinite
> condition, and the next separate patch is an attempt to process it.
>
--
Roman Zhuykov
zhroma@ispras.ru
Ayal Zaks - Oct. 12, 2011, 12:22 a.m.
On Fri, Sep 30, 2011 at 5:22 PM, Roman Zhuykov <zhroma@ispras.ru> wrote:
> 2011/7/21  <zhroma@ispras.ru>:
>> This patch should be applied only after pending patches by Revital.
>
>
> Ping. New version is attached, it suits current trunk without
> additional patches.

Thanks for the ping.

> Also this related patch needs approval:
> http://gcc.gnu.org/ml/gcc-patches/2011-07/msg01804.html
>
>> The loop should meet the following requirements.
>> First three are the same as for loop with doloop pattern:
>> ...
>> The next three describe the control part of new supported loops.
>> - the last jump instruction should look like:  pc=(regF!=0)?label:pc, regF is

you'd probably want to bump to next instruction if falling through,
e.g., pc=(regF!=0)?label:pc+4

>>  flag register;
>> - the last instruction which sets regF should be: regF=COMPARE(regC,X), where X
>>  is a constant, or maybe a register, which is not changed inside a loop;
>> - only one instruction modifies regC inside a loop (other can use regC, but not
>>  write), and it should simply adjust it by a constant: regC=regC+step, where
>>  step is a constant.
>
>> When doloop is succesfully scheduled by SMS, its number of
>> iterations of loop kernel should be decreased by the number of stages in a
>> schedule minus one, while other iterations expand to prologue and epilogue.
>> In new supported loops such approach can't be used, because some
>> instructions can use count register (regC).  Instead of this,
>> the final register value X in compare instruction regF=COMPARE(regC,X)
>> is changed to another value Y respective to the stage this instruction
>> is scheduled (Y = X - stage * step).

making sure this does not underflow; i.e., that the number of
iterations is no less than stage (you've addressed this towards the
end below).

>
> The main difference from doloop case is that regC can be used by some
> instructions in loop body.
> That's why we are unable to simply adjust regC initial value, but have
> to keep it's value correct on each particular iteration.
> So, we change comparison instruction accordingly.
>
> An example:
> int a[100];
> int main()
> {
>  int i;
>  for (i = 85; i > 12; i -= 5)
>      a[i] = i * i;
>  return a[15]-225;
> }
> ARM assembler with "-O2 -fno-auto-inc-dec":
>        ldr     r0, .L5
>        mov     r3, #85
>        mov     r2, r0
> .L2:
>        mul     r1, r3, r3
>        sub     r3, r3, #5
>        cmp     r3, #10
>        str     r1, [r2, #340]
>        sub     r2, r2, #20
>        bne     .L2
>        ldr     r0, [r0, #60]
>        sub     r0, r0, #225
>        bx      lr
> .L5:
>        .word   a
>
> Loop body is executed 15 times.
> When compiling with SMS, it finds a schedule with ii=7, stage_count=3
> and following times:
> Stage  Time       Insn
> 0          5      mul     r1, r3, r3
> 1         10     sub     r3, r3, #5
> 1         11     cmp     r3, #10
> 1         11     str     r1, [r2, #340]
> 1         13     bne     .L2
> 2         16     sub     r2, r2, #20
>

branch is not scheduled last?

> To make new schedule correct the loop body
> should be executed 14 times and we change compare instruction:

the loop itself should execute 13 times.

> regF=COMPARE(regC,X) to regF=COMPARE(regC,Y) where Y = X - stage * step.
> In our example regC is r3, X is 10, step = -5, compare instruction
> is scheduled on stage 1, so it should be Y = 10 - 1 * (-5) = 15.
>

right. In general, if the compare is on stage s (starting from 0), it
will be executed s times in the epilog, so it should exit the loop
upon reaching Y = X - s * step.

> So, after SMS it looks like:
>        ldr     r0, .L5
>        mov     r3, #85
>        mov     r2, r0
> ;;prologue
>        mul     r1, r3, r3      ;;from stage 0 first iteration
>        sub     r3, r3, #5      ;;3 insns from stage 1 first iteration
>        cmp     r3, #10
>        str     r1, [r2, #340]
>        mul     r1, r3, r3      ;;from stage 0 second iteration
> ;;body
> .L2:
>        sub     r3, r3, #5
>        sub     r2, r2, #20
>        cmp     r3, #15         ;; new value to compare with is Y=15
>        str     r1, [r2, #340]
>        mul     r1, r3, r3
>        bne     .L2
> ;;epilogue
>        sub     r2, r2, #20     ;;from stage 2 pre-last iteration
>        sub     r3, r3, #5      ;;3 insns from stage 1 last iteration
>        cmp     r3, #10
>        str     r1, [r2, #340]
>        sub     r2, r2, #20     ;;from stage 2 last iteration
>
>        ldr     r0, [r0, #60]
>        sub     r0, r0, #225
>        bx      lr
> .L5:
>        .word   a
>
> Real ARM assembler with SMS (after some optimizations and without dead code):
>        mov     r3, #85
>        ldr     r0, .L8
>        mul     r1, r3, r3
>        sub     r3, r3, #5
>        mov     r2, r0
>        str     r1, [r0, #340]
>        mul     r1, r3, r3
> .L2:
>        sub     r3, r3, #5
>        sub     r2, r2, #20
>        cmp     r3, #15
>        str     r1, [r2, #340]
>        mul     r1, r3, r3
>        bne     .L2
>        str     r1, [r2, #320]
>        ldr     r0, [r0, #60]
>        sub     r0, r0, #225
>        bx      lr
> .L8:
>        .word   a
>
>>
>> Testing of this appoach reveals two bugs, which do not appear while SMS was
>> used only for doloop loops.  Both these bugs happen due to the nature of the
>> flag register.  On x86_64 it is clobbered by most of arithmetic instructions.
>> The following situation happens when SMS is enabled without register renaming
>> (-fno-modulo-sched-allow-regmoves).  When data dependency graph is built, there
>> is a step when we generate anti-dependencies from last register use to first
>> write of this register at the next iteration.

is a step when we generate anti-dependencies from all last registers
uses (i.e., of last register def) to first write of this register at
the next iteration.

>> At this moment we should also
>> create such dependencies to all instructions which clobber the register to
>> prevent this clobbers being before last use is new schedule.
>>

well, we simply need to connect these last uses to either the first
write *or* the first clobber of this register at the next iteration,
according to whichever is first, no?

>> Here is an model of example:
>>
>> loop {
>> set1 regR
>> use1 regR
>> clobber regR
>> set2 regR
>> use2 regR
>> }
>>
>> If we create only use2->set1 anti-dependency (and no use2->cloober) the
>> following wrong schedule is possible:
>>
>> prologue {
>> set1 regR
>> use1 regR
>> clobber regR
>> }
>> kernel {
>> set2 regR
>> clobber regR (instruction from next iteration in terms of old loop kernel)
>> use2 regR
>> set1 regR (also from next iteration)
>> use1 regR (also from next iteration)
>> }
>> epilogue {
>> set2 regR
>> use2 regR
>> }
>>

strange; according to prolog (and epilog), clobber should appear after
use1 in the kernel, no? Aren't there (intra-iteration) dependencies
preventing the clobber to skip over use1 and/or set1?

>> This problem was succesfully fixed by creating a vector of all clobbering
>> instructions together with first write and adding all needed dependencies.
>>

seems like an overkill to me; we didn't draw an edge between every
last use and every write, because writes are kept in order by having
output dependences between them. So should be the case with clobbers.

This should ideally be solved by a dedicated (separate) patch.

>> The other bug happens only with -fmodulo-sched-allow-regmoves.  Here we
>> eliminate some anti-dependence edges in data dependency graph in order to
>> resolve them later by adding some register moves (renaming instructions).  But
>> in situation as in example above compiler gives an ICE because it can't create
>> a register move, when regR is hardware flag register.  So we have to know which
>> register(s) cause anti-dependency in order to understand whether we can ignore
>> it.  I can't find any easy way to gather this information, so I create my own
>> structures to store this info and had implemented my own hooks for
>> sched_analyze function.  This leads to more complex interconnection between
>> ddg.c and modulo-sched.c.

Well, having ddg.c's/create_ddg_dep_from_intra_loop_link() consult
"Register sets from modulo scheduler structures" to determine if an
anti-dep can be eliminated, seems awkward. One should be able to build
a ddg independent of any modulo scheduler structures.
One simple solution is to keep all anti-dep edges of registers that
are clobbered anywhere in the loop. This might be overly conservative,
but perhaps not so if "On x86_64 it is clobbered by most of arithmetic
instructions".
If an anti-dep between a use and a dep of a clobber register is to be
removed, a dependence should be created between the use and a
clobbering instruction following the def. Hope this is clear.

This too should be solved by a dedicated (separate) patch, for easier digestion.

Presumably, the ddg already has all intra-iteration edges preventing
motion of clobbering instructions within an iteration, and we are only
missing inter-iteration deps or deps surfaced by eliminating
anti-deps, right?

You might consider introducing a new type of dependence for such
edges, "Clobber", if it would help.

>>
>> One more thing to point out is number of loop iterations. When number of
>> iterations of a loop is not known at compile time, SMS has to create two loop
>> versions (original and scheduled), and execute scheduled one only when real
>> number of iterations is bigger than number of stages.  In doloop case the
>> number of iterations simply equals to the count register value before the loop.
>> So SMS finds its constant initialization or makes two loop versions.  In new
>> supported loops number of iterations value is more complex.  It even can't be
>> calculated as (final_reg_value-start_reg_value)/step because of examples like
>> this:
>>
>> for (unsigned int x = 0x0; x != 0x6F80919A; x += 0xEDCBA987) ...;
>>
>> This loop has 22 iterations.  So, i decided to use get_simple_loop_desc
>> function which gives a structure with loop characteristics, some of them helps
>> to find iteration number:
>>
>> rtx niter_expr - The number of iterations of the loop;
>> bool const_iter - True if the loop iterates the constant number of times;
>> unsigned HOST_WIDEST_INT niter - Number of iterations if constant;
>>
>> But we can use these expressions only after looking through some other fields
>> of returned structure:
>>
>> bool simple_p - True if we are able to say anything about number of iterations
>> of the loop;
>> rtx assumptions - Assumptions under that the rest of the information is valid;
>> rtx noloop_assumptions - Assumptions under which the loop ends before reaching
>> the latch;
>> rtx infinite - Condition under which the loop is infinite.
>>
>> I decide to allow SMS scheduling only when simple_p is true and other three
>> fields are NULL_RTX, or when simple_p is true and
>> flag_unsafe_loop_optimizations is set.  One more exception is infinite
>> condition, and the next separate patch is an attempt to process it.
>>

ok, still need to go over this rather lengthy and orthogonal (although
it exposes the bugs above) piece.

Ayal.


> --
> Roman Zhuykov
> zhroma@ispras.ru
>

Patch

2011-09-30  Roman Zhuykov  <zhroma@ispras.ru>
	* ddg.c: New VEC.
	(create_ddg_dep_from_intra_loop_link): Use information about register
	uses and sets to determine correctly whether anti-dependency
	can be ignored.
	(add_cross_iteration_register_deps): Store information about
	all clobbers and first write to a register.  Use collected
	information to create anti-dependencies from last use.
	(build_intra_loop_deps): Add call to sms_sched_analyze_init.
	* modulo-sched.c: Include pointer-set.h.
	(old_sched_deps_info): New structure.
	(regset_pair): New type.
	(insn_map): Declare.
	(curr_insn): Ditto.
	(regset_pair_init, destroy_regset_pair, sms_start_insn,
	sms_finish_insn, sms_note_reg_set, sms_note_reg_clobber
	sms_note_reg_use, extract_from_insn_map,
	sms_sched_analyze_init, sms_create_ddg_finish): New functions.
	(sms_sched_deps_info): Add new callbacks.
	(nondoloop_register_get): New function.
	(const_iteration_count): Rename to ...
	(search_const_init): ...this.  Add new parameter (is_const).  Always
	return register initialization rtx and set is_const to true
	only when it is constant.
	(duplicate_insns_of_cycles): Add new parameter (doloop_p).  Do not
	duplicate instructions with count_reg only when doloop_p is set.
	Update all callers.
	(generate_prolog_epilog): Add new parameters.  Correctly generate loop
	prologue for new loop pattern.
	(sms_schedule): Support new loop pattern.
	* sched-int.h (sms_sched_analyze_init, extract_from_insn_map): Export.
---
 gcc/ddg.c          |  131 +++++++---
 gcc/modulo-sched.c |  763 ++++++++++++++++++++++++++++++++++++++++++++++------
 gcc/sched-int.h    |    2 +
 3 files changed, 780 insertions(+), 116 deletions(-)

diff --git a/gcc/ddg.c b/gcc/ddg.c
index 856fa4e..b80bd2d 100644
--- a/gcc/ddg.c
+++ b/gcc/ddg.c
@@ -49,6 +49,10 @@  along with GCC; see the file COPYING3.  If not see
 /* A flag indicating that a ddg edge belongs to an SCC or not.  */
 enum edge_flag {NOT_IN_SCC = 0, IN_SCC};
 
+/* A vector of dependencies needed while processing clobbers.  */
+DEF_VEC_P(df_ref);
+DEF_VEC_ALLOC_P(df_ref,heap);
+
 /* Forward declarations.  */
 static void add_backarc_to_ddg (ddg_ptr, ddg_edge_ptr);
 static void add_backarc_to_scc (ddg_scc_ptr, ddg_edge_ptr);
@@ -178,23 +182,45 @@  create_ddg_dep_from_intra_loop_link (ddg_ptr g, ddg_node_ptr src_node,
      whose register has multiple defs in the loop.  */
   if (flag_modulo_sched_allow_regmoves && (t == ANTI_DEP && dt == REG_DEP))
     {
-      rtx set;
+      bool can_delete_dep = true;
+      unsigned regno;
+      reg_set_iterator rsi;
+      regset src_uses, dest_sets, regs;
+
+      /* Register sets from modulo scheduler structures.  */
+      src_uses = extract_from_insn_map (src_node->insn, 0);
+      dest_sets = extract_from_insn_map (dest_node->insn, 1);
+
+      if (!src_uses || !dest_sets)
+	return;
 
-      set = single_set (dest_node->insn);
-      /* TODO: Handle registers that REG_P is not true for them, i.e.
-         subregs and special registers.  */
-      if (set && REG_P (SET_DEST (set)))
+      /* Build regset intersection.  */
+      regs = ALLOC_REG_SET (&reg_obstack);
+      COPY_REG_SET (regs, src_uses);
+      AND_REG_SET (regs, dest_sets);
+
+      EXECUTE_IF_SET_IN_REG_SET (regs, 0, regno, rsi)
         {
-          int regno = REGNO (SET_DEST (set));
           df_ref first_def;
           struct df_rd_bb_info *bb_info = DF_RD_BB_INFO (g->bb);
 
           first_def = df_bb_regno_first_def_find (g->bb, regno);
           gcc_assert (first_def);
 
-          if (bitmap_bit_p (&bb_info->gen, DF_REF_ID (first_def)))
-            return;
+	  /* CC-flags and other hard registers can't be renamed.
+	     Check whether loop kernel has only one def.  */
+          if (HARD_REGISTER_NUM_P (regno)
+	      || !bitmap_bit_p (&bb_info->gen, DF_REF_ID (first_def)))
+	    {
+	      can_delete_dep = false;
+	      break;
+	    }
         }
+
+      FREE_REG_SET (regs);
+
+      if (can_delete_dep)
+	return;
     }
 
    latency = dep_cost (link);
@@ -247,16 +273,20 @@  create_ddg_dep_no_link (ddg_ptr g, ddg_node_ptr from, ddg_node_ptr to,
 static void
 add_cross_iteration_register_deps (ddg_ptr g, df_ref last_def)
 {
-  int regno = DF_REF_REGNO (last_def);
+  unsigned int regno = DF_REF_REGNO (last_def);
   struct df_link *r_use;
   int has_use_in_bb_p = false;
-  rtx def_insn = DF_REF_INSN (last_def);
+  rtx insn, def_insn = DF_REF_INSN (last_def);
   ddg_node_ptr last_def_node = get_node_of_insn (g, def_insn);
   ddg_node_ptr use_node;
+  df_ref *def_rec;
+  unsigned int uid;
+  static VEC(df_ref,heap) *all_defs;
 #ifdef ENABLE_CHECKING
   struct df_rd_bb_info *bb_info = DF_RD_BB_INFO (g->bb);
 #endif
   df_ref first_def = df_bb_regno_first_def_find (g->bb, regno);
+  bool first_write = true;
 
   gcc_assert (last_def_node);
   gcc_assert (first_def);
@@ -267,6 +297,31 @@  add_cross_iteration_register_deps (ddg_ptr g, df_ref last_def)
 			       DF_REF_ID (first_def)));
 #endif
 
+  all_defs = VEC_alloc (df_ref, heap, 0);
+
+  /* Find all defs which are clobbers and the first normal write def.  */
+  FOR_BB_INSNS (g->bb, insn)
+    {
+      if (!INSN_P (insn))
+        continue;
+      uid = INSN_UID (insn);
+      for (def_rec = DF_INSN_UID_DEFS (uid); *def_rec; def_rec++)
+        {
+          df_ref def = *def_rec;
+          if (DF_REF_REGNO (def) == regno)
+            {
+	      bool is_clobber = DF_REF_FLAGS (def) & (DF_REF_MUST_CLOBBER
+						      | DF_REF_MAY_CLOBBER);
+	      if (is_clobber || first_write)
+	        {
+		  VEC_safe_push (df_ref, heap, all_defs, def);
+		  if (!is_clobber)
+		    first_write = false;
+		}
+	    }
+        }
+    }
+
   /* Create inter-loop true dependences and anti dependences.  */
   for (r_use = DF_REF_CHAIN (last_def); r_use != NULL; r_use = r_use->next)
     {
@@ -290,25 +345,32 @@  add_cross_iteration_register_deps (ddg_ptr g, df_ref last_def)
 	}
       else if (!DEBUG_INSN_P (use_insn))
 	{
+	  unsigned int i;
+	  df_ref curr_def;
 	  /* Add anti deps from last_def's uses in the current iteration
-	     to the first def in the next iteration.  We do not add ANTI
-	     dep when there is an intra-loop TRUE dep in the opposite
-	     direction, but use regmoves to fix such disregarded ANTI
-	     deps when broken.	If the first_def reaches the USE then
-	     there is such a dep.  */
-	  ddg_node_ptr first_def_node = get_node_of_insn (g,
-							  DF_REF_INSN (first_def));
-
-	  gcc_assert (first_def_node);
-
-         /* Always create the edge if the use node is a branch in
-            order to prevent the creation of reg-moves.  */
-          if (DF_REF_ID (last_def) != DF_REF_ID (first_def)
-              || !flag_modulo_sched_allow_regmoves
-	      || JUMP_P (use_node->insn))
-            create_ddg_dep_no_link (g, use_node, first_def_node, ANTI_DEP,
-                                    REG_DEP, 1);
-
+	     to the first def and all clobbers in the next iteration.
+	     We do not add ANTI dep when there is an intra-loop TRUE dep
+	     in the opposite direction, but use regmoves to fix such
+	     disregarded ANTI deps when broken.	If the curr_def reaches
+	     the USE then there is such a dep.  */
+	  FOR_EACH_VEC_ELT (df_ref, all_defs, i, curr_def)
+	    {
+	      if (DF_REF_ID (last_def) != DF_REF_ID (curr_def)
+		  /* Some hard regs (for ex. CC-flags) can't be renamed.  */
+                  || HARD_REGISTER_P (DF_REF_REG (last_def))
+		  || !flag_modulo_sched_allow_regmoves
+		  /* Always create the edge if the use node is a branch in
+		     order to prevent the creation of reg-moves.  */
+		  || (flag_modulo_sched_allow_regmoves
+		      && JUMP_P (use_node->insn)))
+		{
+	          ddg_node_ptr curr_def_node = get_node_of_insn (g,
+						DF_REF_INSN (curr_def));
+		  gcc_assert (curr_def_node);
+		  create_ddg_dep_no_link (g, use_node, curr_def_node,
+					  ANTI_DEP, REG_DEP, 1);
+	        }
+	    }
 	}
     }
   /* Create an inter-loop output dependence between LAST_DEF (which is the
@@ -318,18 +380,15 @@  add_cross_iteration_register_deps (ddg_ptr g, df_ref last_def)
      defs starting with a true dependence to a use which can be in the
      next iteration; followed by an anti dependence of that use to the
      first def (i.e. if there is a use between the two defs.)  */
-  if (!has_use_in_bb_p)
+  if (!has_use_in_bb_p && DF_REF_ID (last_def) != DF_REF_ID (first_def))
     {
-      ddg_node_ptr dest_node;
-
-      if (DF_REF_ID (last_def) == DF_REF_ID (first_def))
-	return;
-
-      dest_node = get_node_of_insn (g, DF_REF_INSN (first_def));
+      ddg_node_ptr dest_node = get_node_of_insn (g, DF_REF_INSN (first_def));
       gcc_assert (dest_node);
       create_ddg_dep_no_link (g, last_def_node, dest_node,
 			      OUTPUT_DEP, REG_DEP, 1);
     }
+
+  VEC_free (df_ref, heap, all_defs);
 }
 /* Build inter-loop dependencies, by looking at DF analysis backwards.  */
 static void
@@ -463,6 +522,8 @@  build_intra_loop_deps (ddg_ptr g)
 
   /* Build the dependence information, using the sched_analyze function.  */
   init_deps_global ();
+  /* Set sms hooks and initialize additional structures.  */
+  sms_sched_analyze_init ();
   init_deps (&tmp_deps, false);
 
   /* Do the intra-block data dependence analysis for the given block.  */
diff --git a/gcc/modulo-sched.c b/gcc/modulo-sched.c
index 28be942..3fed5c2 100644
--- a/gcc/modulo-sched.c
+++ b/gcc/modulo-sched.c
@@ -48,6 +48,7 @@  along with GCC; see the file COPYING3.  If not see
 #include "tree-pass.h"
 #include "dbgcnt.h"
 #include "df.h"
+#include "pointer-set.h"
 
 #ifdef INSN_SCHEDULING
 
@@ -200,9 +201,10 @@  static void set_node_sched_params (ddg_ptr);
 static partial_schedule_ptr sms_schedule_by_order (ddg_ptr, int, int, int *);
 static void permute_partial_schedule (partial_schedule_ptr, rtx);
 static void generate_prolog_epilog (partial_schedule_ptr, struct loop *,
-                                    rtx, rtx);
+                                    rtx, bool, bool, rtx, HOST_WIDEST_INT,
+                                    bool, HOST_WIDEST_INT, rtx *);
 static void duplicate_insns_of_cycles (partial_schedule_ptr,
-				       int, int, int, rtx);
+				       int, int, int, rtx, bool);
 static int calculate_stage_count (partial_schedule_ptr, int);
 static void calculate_must_precede_follow (ddg_node_ptr, int, int,
 					   int, int, sbitmap, sbitmap, sbitmap);
@@ -248,7 +250,162 @@  typedef struct node_sched_params
 } *node_sched_params_ptr;
 
 
-/* The following three functions are copied from the current scheduler
+/* Next data structures and callbacks help to store information about
+   using registers in insn.  */
+static struct sched_deps_info_def old_sched_deps_info;
+
+/* Two regsets to store which registers the insn reads and writes.  */
+typedef struct regset_pair_def
+{
+  regset uses;
+  regset sets;
+} regset_pair;
+
+/* Allocate memory for regset_pair structure.  */
+static regset_pair*
+regset_pair_init (void)
+{
+  regset_pair *trs = (regset_pair *)xcalloc (1, sizeof (regset_pair));
+  trs->uses = ALLOC_REG_SET (&reg_obstack);
+  trs->sets = ALLOC_REG_SET (&reg_obstack);
+  return trs;
+}
+
+/* Pointer map is used to find a reg sets for insn.  */
+static struct pointer_map_t *insn_map;
+
+/* Find (or create) regset_pair for INSN in pointer_map.  */
+static regset_pair*
+regset_pair_get (rtx insn)
+{
+  void **slot = pointer_map_contains (insn_map, insn);
+  if (!slot)
+    {
+      slot = pointer_map_insert (insn_map, insn);
+      *slot = regset_pair_init ();
+    }
+  return (regset_pair*)*slot;
+}
+
+/* Callback for pointer_map_traverse to free memory used by regset_pair.  */
+static bool
+destroy_regset_pair (const void *key ATTRIBUTE_UNUSED, void **slot,
+		   void *data ATTRIBUTE_UNUSED)
+{
+  regset_pair *trs = (regset_pair*)*slot;
+  FREE_REG_SET (trs->uses);
+  FREE_REG_SET (trs->sets);
+  free (trs);
+  return true;
+}
+
+/* SMS sched_analyze hooks.  Every of them calls original hook first.  */
+static rtx curr_insn = NULL_RTX;
+
+static void
+sms_start_insn (rtx insn)
+{
+  old_sched_deps_info.start_insn (insn);
+
+  gcc_assert (insn && !curr_insn);
+  curr_insn = insn;
+  if (dump_file)
+    {
+      fprintf (dump_file, "sms analyze: start insn:\n");
+      print_rtl_single (dump_file, curr_insn);
+    }
+}
+
+static void
+sms_finish_insn (void)
+{
+  old_sched_deps_info.finish_insn ();
+
+  gcc_assert (curr_insn);
+  curr_insn = NULL_RTX;
+  if (dump_file)
+    fprintf (dump_file, "sms analyze: finished insn\n");
+}
+
+static void
+sms_note_reg_set (int regno)
+{
+  regset_pair *trs;
+  old_sched_deps_info.note_reg_set (regno);
+
+  gcc_assert (curr_insn);
+  trs = regset_pair_get (curr_insn);
+  SET_REGNO_REG_SET (trs->sets, regno);
+
+  if (dump_file)
+    fprintf (dump_file, "sms analyze: reg set %d\n", regno);
+}
+
+static void
+sms_note_reg_clobber (int regno)
+{
+  regset_pair *trs;
+  old_sched_deps_info.note_reg_clobber (regno);
+
+  gcc_assert (curr_insn);
+  trs = regset_pair_get (curr_insn);
+  SET_REGNO_REG_SET (trs->sets, regno);
+
+  if (dump_file)
+    fprintf (dump_file, "sms analyze: reg clobber %d\n", regno);
+}
+
+static void
+sms_note_reg_use (int regno)
+{
+  regset_pair *trs;
+  old_sched_deps_info.note_reg_use (regno);
+
+  gcc_assert (curr_insn);
+  trs = regset_pair_get (curr_insn);
+  SET_REGNO_REG_SET (trs->uses, regno);
+
+  if (dump_file)
+    fprintf (dump_file, "sms analyze: reg use %d\n", regno);
+}
+
+/* Extract the saved data about register usage.  Bool SETS is true,
+   when we need the set of written regs.  */
+regset
+extract_from_insn_map (rtx insn, bool sets)
+{
+  void **slot = pointer_map_contains (insn_map, insn);
+  regset_pair *trs;
+  if (!slot)
+    return NULL;
+  trs = (regset_pair*)*slot;
+  return sets ? trs->sets : trs->uses;
+}
+
+/* Setup SMS hooks.  Initialize pointer_map.  */
+void
+sms_sched_analyze_init (void)
+{
+  memcpy (&old_sched_deps_info, sched_deps_info,
+	  sizeof (struct sched_deps_info_def));
+  sched_deps_info->start_insn = sms_start_insn;
+  sched_deps_info->finish_insn = sms_finish_insn;
+  sched_deps_info->note_reg_set = sms_note_reg_set;
+  sched_deps_info->note_reg_clobber = sms_note_reg_clobber;
+  sched_deps_info->note_reg_use = sms_note_reg_use;
+  insn_map = pointer_map_create ();
+}
+
+/* Free pointer_map with freeing internal structures first.  */
+static void
+sms_create_ddg_finish (void)
+{
+  pointer_map_traverse (insn_map, destroy_regset_pair, NULL);
+  pointer_map_destroy (insn_map);
+}
+
+
+/* The following two functions are copied from the current scheduler
    code in order to use sched_analyze() for computing the dependencies.
    They are used when initializing the sched_info structure.  */
 static const char *
@@ -271,9 +428,20 @@  static struct common_sched_info_def sms_common_sched_info;
 static struct sched_deps_info_def sms_sched_deps_info =
   {
     compute_jump_reg_dependencies,
-    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-    NULL,
-    0, 0, 0
+    sms_start_insn,
+    sms_finish_insn,
+    NULL, /* start_lhs */
+    NULL, /* finish_lhs */
+    NULL, /* start_rhs */
+    NULL, /* finish_rhs */
+    sms_note_reg_set,
+    sms_note_reg_clobber,
+    sms_note_reg_use,
+    NULL, /* note_mem_dep */
+    NULL, /* note_dep */
+    0, /* use_cselib */
+    0, /* use_deps_list */
+    0 /* generate_spec_deps */
   };
 
 static struct haifa_sched_info sms_sched_info =
@@ -295,6 +463,7 @@  static struct haifa_sched_info sms_sched_info =
   0
 };
 
+
 /* Given HEAD and TAIL which are the first and last insns in a loop;
    return the register which controls the loop.  Return zero if it has
    more than one occurrence in the loop besides the control part or the
@@ -348,37 +517,164 @@  doloop_register_get (rtx head ATTRIBUTE_UNUSED, rtx tail ATTRIBUTE_UNUSED)
 #endif
 }
 
-/* Check if COUNT_REG is set to a constant in the PRE_HEADER block, so
-   that the number of iterations is a compile-time constant.  If so,
-   return the rtx that sets COUNT_REG to a constant, and set COUNT to
-   this constant.  Otherwise return 0.  */
+/* Same as previous for loop with always-the-same-step counter.  */
+static rtx
+nondoloop_register_get (rtx head, rtx tail, int cmp_side,
+			rtx *addsub_output, rtx *cmp_output)
+{
+  rtx insn, reg, flagreg, addsub, cmp, end;
+
+  /* Check jump instruction form */
+  insn = single_set (tail);
+  if (insn == NULL_RTX
+      || SET_DEST (insn) != pc_rtx
+      || GET_CODE (SET_SRC (insn)) != IF_THEN_ELSE
+      || GET_CODE (XEXP (SET_SRC (insn), 1)) != LABEL_REF
+      || XEXP (SET_SRC (insn), 2) != pc_rtx)
+    return NULL_RTX;
+
+  /* Check loop exit condition */
+  insn = XEXP (SET_SRC (insn), 0);
+  if (GET_CODE (insn) != NE || XEXP (insn, 1) != const0_rtx)
+    return NULL_RTX;
+
+  /* Flags register */
+  flagreg = XEXP (insn, 0);
+
+  /* Searching comparison instruction */
+  cmp = PREV_INSN (tail);
+  while (cmp != PREV_INSN (head))
+    {
+      if (INSN_P (cmp) && reg_set_p (flagreg, cmp))
+        break;
+      cmp = PREV_INSN (cmp);
+    }
+  if (cmp == PREV_INSN (head))
+    return NULL_RTX;
+
+  /* Check comparison */
+  insn = single_set (cmp);
+  if (insn == NULL_RTX
+      || ! rtx_equal_p (flagreg, SET_DEST (insn))
+      || GET_CODE (SET_SRC (insn)) != COMPARE)
+    return NULL_RTX;
+
+  /* Loop register */
+  gcc_assert (0 <= cmp_side && cmp_side <= 1);
+  reg = XEXP (SET_SRC (insn), cmp_side);
+  if (! REG_P (reg))
+    return NULL_RTX;
+
+  /* End value */
+  end = XEXP (SET_SRC (insn), 1 - cmp_side);
+  if (! REG_P (end) && ! CONST_INT_P (end))
+    return NULL_RTX;
+
+  /* Searching register add\sub instruction */
+  addsub = PREV_INSN (cmp);
+  while (addsub != PREV_INSN (head))
+    {
+      if (INSN_P (addsub) && reg_set_p (reg, addsub))
+        break;
+      addsub = PREV_INSN (addsub);
+    }
+  if (addsub == PREV_INSN (head))
+    return NULL_RTX;
+
+  /* Checking register change instruction */
+  insn = single_set (addsub);
+  if (insn == NULL_RTX || ! rtx_equal_p (reg, SET_DEST (insn)))
+    return NULL_RTX;
+  insn = SET_SRC (insn);
+  if ((GET_CODE (insn) != PLUS && GET_CODE (insn) != MINUS)
+      || ! rtx_equal_p (reg, XEXP (insn, 0))
+      || ! (CONST_INT_P (XEXP (insn, 1))))
+    return NULL_RTX;
+
+  /* No other REG and END (if reg) modifications allowed */
+  for (insn = head; insn != tail; insn = NEXT_INSN (insn))
+    {
+      if (REG_P(end) && reg_set_p (end, insn))
+        {
+          if (dump_file)
+          {
+            fprintf (dump_file, "SMS end register found ");
+            print_rtl_single (dump_file, reg);
+            fprintf (dump_file, " outside write in insn:\n");
+            print_rtl_single (dump_file, insn);
+          }
+	  return NULL_RTX;
+	}
+      if (insn != addsub && reg_set_p (reg, insn))
+        {
+          if (dump_file)
+          {
+            fprintf (dump_file, "SMS count_reg found ");
+            print_rtl_single (dump_file, reg);
+            fprintf (dump_file, " outside write in insn:\n");
+            print_rtl_single (dump_file, insn);
+          }
+          return NULL_RTX;
+        }
+    }
+
+  *addsub_output = addsub;
+  *cmp_output = cmp;
+  return reg;
+}
+
+/* Check if REG is set to a constant in the PRE_HEADER block.
+   If possible to find, return the rtx that sets REG.
+   If REG is set to a constant (probably not directly),
+   set IS_CONST to true and VALUE to that constant value.  */
 static rtx
-const_iteration_count (rtx count_reg, basic_block pre_header,
-		       HOST_WIDEST_INT * count)
+search_const_init (basic_block pre_header, rtx reg, bool *is_const,
+		   HOST_WIDEST_INT *value)
 {
   rtx insn;
   rtx head, tail;
 
-  if (! pre_header)
-    return NULL_RTX;
+  if (!pre_header)
+    {
+      *is_const = false;
+      return NULL_RTX;
+    }
 
   get_ebb_head_tail (pre_header, pre_header, &head, &tail);
 
   for (insn = tail; insn != PREV_INSN (head); insn = PREV_INSN (insn))
     if (NONDEBUG_INSN_P (insn) && single_set (insn) &&
-	rtx_equal_p (count_reg, SET_DEST (single_set (insn))))
+	rtx_equal_p (reg, SET_DEST (single_set (insn))))
       {
-	rtx pat = single_set (insn);
+	rtx src, pat = single_set (insn);
+	src = SET_SRC (pat);
 
-	if (CONST_INT_P (SET_SRC (pat)))
+	if (CONST_INT_P (src))
 	  {
-	    *count = INTVAL (SET_SRC (pat));
-	    return insn;
+	    *is_const = true;
+	    *value = INTVAL (src);
+	  }
+	else if (REG_P (src))
+	  { /* Check if previous insn sets SRC = constant.  */
+	    pat = single_set (PREV_INSN (insn));
+	    if (pat != NULL_RTX && rtx_equal_p (src, SET_DEST (pat))
+		&& CONST_INT_P (SET_SRC (pat)))
+	      {
+		*is_const = true;
+		*value = INTVAL (SET_SRC (pat));
+	      }
+	    else
+		*is_const = false;
 	  }
+	else
+	  *is_const = false;
 
-	return NULL_RTX;
+	return insn;
       }
+    else if (reg_set_p (reg, insn))
+      break;
 
+  *is_const = false;
   return NULL_RTX;
 }
 
@@ -886,7 +1182,8 @@  clear:
 
 static void
 duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage,
-			   int to_stage, int for_prolog, rtx count_reg)
+			   int to_stage, int for_prolog, rtx count_reg,
+			   bool doloop_p)
 {
   int row;
   ps_insn_ptr ps_ij;
@@ -898,14 +1195,14 @@  duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage,
 	int j, i_reg_moves;
 	rtx reg_move = NULL_RTX;
 
-        /* Do not duplicate any insn which refers to count_reg as it
-           belongs to the control part.
+        /* In doloop case do not duplicate any insn which refers
+	   to count_reg as it belongs to the control part.
            The closing branch is scheduled as well and thus should
            be ignored.
            TODO: This should be done by analyzing the control part of
            the loop.  */
-        if (reg_mentioned_p (count_reg, u_node->insn)
-            || JUMP_P (ps_ij->node->insn))
+        if ((doloop_p && reg_mentioned_p (count_reg, u_node->insn))
+            || JUMP_P (u_node->insn))
           continue;
 
 	if (for_prolog)
@@ -957,7 +1254,10 @@  duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage,
 /* Generate the instructions (including reg_moves) for prolog & epilog.  */
 static void
 generate_prolog_epilog (partial_schedule_ptr ps, struct loop *loop,
-                        rtx count_reg, rtx count_init)
+                        rtx count_reg, bool doloop_p, bool count_init_isconst,
+			rtx fin_reg, HOST_WIDEST_INT fin_nonconst_adjust,
+			bool create_reg, HOST_WIDEST_INT reg_val,
+			rtx *created_reg)
 {
   int i;
   int last_stage = PS_STAGE_COUNT (ps) - 1;
@@ -966,12 +1266,12 @@  generate_prolog_epilog (partial_schedule_ptr ps, struct loop *loop,
   /* Generate the prolog, inserting its insns on the loop-entry edge.  */
   start_sequence ();
 
-  if (!count_init)
+  if (doloop_p && !count_init_isconst)
     {
-      /* Generate instructions at the beginning of the prolog to
-         adjust the loop count by STAGE_COUNT.  If loop count is constant
-         (count_init), this constant is adjusted by STAGE_COUNT in
-         generate_prolog_epilog function.  */
+      /* In doloop we generate instructions at the beginning of the prolog to
+         adjust the initial value of doloop counter by STAGE_COUNT.
+	 If loop count is constant, this adjustment is done outside this
+         function, simply correcting the source of initialization insn.  */
       rtx sub_reg = NULL_RTX;
 
       sub_reg = expand_simple_binop (GET_MODE (count_reg), MINUS,
@@ -982,8 +1282,40 @@  generate_prolog_epilog (partial_schedule_ptr ps, struct loop *loop,
         emit_move_insn (count_reg, sub_reg);
     }
 
+  if (!doloop_p)
+    {
+      /* In non-doloop we generate instructions at the beginning of
+         the prolog to adjust the final value (with this value loop count
+	 register is compared to check whether the loop should stop).  */
+      if (fin_nonconst_adjust != 0)
+	{
+	  /* If the final value is in a register - create another register
+	     to store a shifted value.  */
+	  rtx new_reg, reg = NULL_RTX;
+	  reg = gen_reg_rtx (GET_MODE (fin_reg));
+	  new_reg = expand_simple_binop (GET_MODE (fin_reg), MINUS, fin_reg,
+					 GEN_INT (fin_nonconst_adjust),
+					 reg, 0, OPTAB_DIRECT);
+	  gcc_assert (REG_P (new_reg));
+	  if (REGNO (new_reg) != REGNO (reg))
+	    emit_move_insn (reg, new_reg);
+	  *created_reg = new_reg;
+	}
+      else if (create_reg)
+	{
+	  /* If old final value is an immediate, and the new one can't be
+	     an immediate, we create a register to store it.  If both values
+	     are immediate the adjustment is done outside this fuction,
+	     just correcting the constant value in compare intruction.  */
+	  rtx reg = NULL_RTX;
+	  reg = gen_reg_rtx (GET_MODE (count_reg));
+	  emit_move_insn (reg, GEN_INT (reg_val));
+	  *created_reg = reg;
+	}
+    }
+
   for (i = 0; i < last_stage; i++)
-    duplicate_insns_of_cycles (ps, 0, i, 1, count_reg);
+    duplicate_insns_of_cycles (ps, 0, i, 1, count_reg, doloop_p);
 
   /* Put the prolog on the entry edge.  */
   e = loop_preheader_edge (loop);
@@ -995,7 +1327,7 @@  generate_prolog_epilog (partial_schedule_ptr ps, struct loop *loop,
   start_sequence ();
 
   for (i = 0; i < last_stage; i++)
-    duplicate_insns_of_cycles (ps, i + 1, last_stage, 0, count_reg);
+    duplicate_insns_of_cycles (ps, i + 1, last_stage, 0, count_reg, doloop_p);
 
   /* Put the epilogue on the exit edge.  */
   gcc_assert (single_exit (loop));
@@ -1258,13 +1590,30 @@  sms_schedule (void)
           continue;
         }
 
-      /* Make sure this is a doloop.  */
-      if ( !(count_reg = doloop_register_get (head, tail)))
-      {
-        if (dump_file)
-          fprintf (dump_file, "SMS doloop_register_get failed\n");
-	continue;
-      }
+      /* Is this a doloop?  */
+      if ((count_reg = doloop_register_get (head, tail)))
+        {
+	  if (dump_file)
+	    fprintf (dump_file, "SMS doloop\n");
+        }
+      else if ((count_reg = nondoloop_register_get (head, tail, 0,
+						    &insn, &insn)))
+	{
+	  if (dump_file)
+	    fprintf (dump_file, "SMS non-doloop\n");
+	}
+      else if ((count_reg = nondoloop_register_get (head, tail, 1,
+						    &insn, &insn)))
+	{
+	  if (dump_file)
+	    fprintf (dump_file, "SMS non-doloop with transposed cmp\n");
+	}
+      else
+	{
+	  if (dump_file)
+	    fprintf (dump_file, "SMS imcompatible loop\n");
+	  continue;
+	}
 
       /* Don't handle BBs with calls or barriers or auto-increment insns 
 	 (to avoid creating invalid reg-moves for the auto-increment insns),
@@ -1319,7 +1668,7 @@  sms_schedule (void)
 	    fprintf (dump_file, "SMS create_ddg failed\n");
 	  continue;
         }
-
+      sms_create_ddg_finish ();
       g_arr[loop->num] = g;
       if (dump_file)
         fprintf (dump_file, "...OK\n");
@@ -1331,15 +1680,27 @@  sms_schedule (void)
     fprintf (dump_file, "=========================\n\n");
   }
 
+  df_clear_flags (DF_LR_RUN_DCE);
+
   /* We don't want to perform SMS on new loops - created by versioning.  */
   FOR_EACH_LOOP (li, loop, 0)
     {
+      bool doloop_p, count_fin_isconst, count_init_isconst;
+      bool was_immediate = false;
+      bool prolog_create_reg = false;
+      int prolog_fin_nonconst_adjust = 0;
+      bool nonsimple_loop = false;
       rtx head, tail;
-      rtx count_reg, count_init;
-      int mii, rec_mii;
-      unsigned stage_count = 0;
-      HOST_WIDEST_INT loop_count = 0;
       bool opt_sc_p = false;
+      rtx count_reg, count_fin_reg, new_comp_reg = NULL_RTX;
+      rtx count_init_insn, count_fin_init_insn;
+      rtx add, cmp;
+      int mii, rec_mii, cmp_side = -1;
+      int stage_count = 0;
+      HOST_WIDEST_INT count_init_val = 0, count_fin_val = 0;
+      HOST_WIDEST_INT count_step = 0, loop_count = -1;
+      HOST_WIDEST_INT count_fin_newval = 0;
+      struct niter_desc *desc = NULL;
 
       if (! (g = g_arr[loop->num]))
         continue;
@@ -1377,32 +1738,159 @@  sms_schedule (void)
 	               (HOST_WIDEST_INT) profile_info->sum_max);
 	      fprintf (dump_file, "\n");
 	    }
-	  fprintf (dump_file, "SMS doloop\n");
 	  fprintf (dump_file, "SMS built-ddg %d\n", g->num_nodes);
           fprintf (dump_file, "SMS num-loads %d\n", g->num_loads);
           fprintf (dump_file, "SMS num-stores %d\n", g->num_stores);
 	}
 
 
-      /* In case of th loop have doloop register it gets special
-	 handling.  */
-      count_init = NULL_RTX;
-      if ((count_reg = doloop_register_get (head, tail)))
+      /* Extract count register and determine loop type.  */
+      add = NULL_RTX;
+      cmp = NULL_RTX;
+      if ((count_reg = doloop_register_get (head, tail))
+	  || (count_reg = nondoloop_register_get (head, tail, 0, &add, &cmp))
+	  || (count_reg = nondoloop_register_get (head, tail, 1, &add, &cmp)))
 	{
-	  basic_block pre_header;
+	  basic_block pre_header = loop_preheader_edge (loop)->src;
+
+	  doloop_p = (cmp == NULL_RTX);
+	  if (doloop_p)
+	    {
+	      /* Doloop finish parameters are always the same.  */
+	      count_step = -1;
+	      count_fin_isconst = true;
+	      count_fin_val = 0;
+	      count_fin_reg = NULL_RTX;
+	      count_fin_init_insn = NULL_RTX;
+	    }
+	  else
+	    {
+	      /* In other loop we need to determine counter step
+	         and finish parameters.  */
+	      rtx step, end;
+
+	      gcc_assert (single_set (add) && single_set (cmp));
+
+	      /* Extract the step.  */
+	      step = XEXP (SET_SRC (single_set (add)), 1);
+	      gcc_assert (CONST_INT_P (step));
+
+	      if (GET_CODE (SET_SRC (single_set (add))) == MINUS)
+	        count_step = - INTVAL (step);
+	      else if (GET_CODE (SET_SRC (single_set (add))) == PLUS)
+	        count_step = INTVAL (step);
+	      else
+		gcc_unreachable ();
+
+	      gcc_assert(count_step != 0);
+
+	      /* Check what operand of compare insn is a counter register.  */
+	      if (count_reg == XEXP (SET_SRC (single_set (cmp)), 0))
+		cmp_side = 0;
+	      else if (count_reg == XEXP (SET_SRC (single_set (cmp)), 1))
+		cmp_side = 1;
+	      else
+		gcc_unreachable ();
 
-	  pre_header = loop_preheader_edge (loop)->src;
-	  count_init = const_iteration_count (count_reg, pre_header,
-					      &loop_count);
+	      /* Extract finish border for counter reg.  */
+	      end = XEXP (SET_SRC (single_set (cmp)), 1 - cmp_side);
+
+	      if (CONST_INT_P (end))
+		{
+		  /* Constant finish border.  loop until (reg != const).  */
+		  count_fin_isconst = true;
+		  count_fin_val = INTVAL (end);
+		  count_fin_reg = NULL_RTX;
+		  count_fin_init_insn = NULL_RTX;
+		}
+	      else if (REG_P (end))
+		{
+		  /* Register is a border.  Loop until (reg != fin_reg).  */
+		  count_fin_reg = end;
+		  count_fin_isconst = false;
+		  /* Try to find constant initinalization of fin_reg
+		   * in preheader.  */
+		  count_fin_init_insn = search_const_init (pre_header,
+							   count_fin_reg,
+							   &count_fin_isconst,
+							   &count_fin_val);
+		}
+	      else
+		gcc_unreachable ();
+	    }
+	  /* Try to find a constant initalization of count_reg in preheader.  */
+	  count_init_insn = search_const_init (pre_header,
+					       count_reg,
+					       &count_init_isconst,
+					       &count_init_val);
+	}
+      else /* Loop is incompatible now, but it was OK on while analyzing!  */
+	gcc_assert (count_reg);
+
+
+      desc = get_simple_loop_desc (loop);
+      gcc_assert (desc);
+      /* nonsimple_loop means it's impossible to analyze the loop
+         or there are some assumptions to make the analyzis results right
+         or there is a condition of non-infinite number of iterations.
+        We want doloops to be scheduled even if analyzis shows they are
+	 nonsimple (backward compatibility).  */
+      nonsimple_loop = !desc->simple_p;
+      /* We allow scheduling loop with some assumptions or infinite condition
+	 only when unsafe_loop_optimizations flag is enabled.  */
+      if (flag_unsafe_loop_optimizations)
+	 {
+	   desc->infinite = NULL_RTX;
+	   desc->assumptions = NULL_RTX;
+	   desc->noloop_assumptions = NULL_RTX;
+	 }
+      nonsimple_loop = nonsimple_loop || (desc->assumptions != NULL_RTX)
+			|| (desc->noloop_assumptions != NULL_RTX)
+			|| (desc->infinite != NULL_RTX);
+      /* Only doloops can be nonsimple_loops for SMS.  */
+      if (nonsimple_loop && !doloop_p)
+	{
+	  free_ddg (g);
+	  continue;
+	}
+      /* Manually set some description fields in non-simple doloop.  */
+      if (nonsimple_loop)
+	{
+	  gcc_assert(doloop_p);
+	  desc->const_iter = false;
+	  desc->infinite = NULL_RTX;
 	}
-      gcc_assert (count_reg);
 
-      if (dump_file && count_init)
+      if (desc->const_iter)
+	{
+	  gcc_assert (!desc->infinite);
+	  loop_count = desc->niter;
+	  if (dump_file)
+	    fprintf (dump_file, "SMS const loop iterations = "
+		     HOST_WIDEST_INT_PRINT_DEC "\n", loop_count);
+	}
+      if (count_init_isconst && count_fin_isconst)
         {
-          fprintf (dump_file, "SMS const-doloop ");
-          fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
-		     loop_count);
-          fprintf (dump_file, "\n");
+	  gcc_assert (doloop_p || desc->const_iter);
+	  if (doloop_p)
+	    {
+	      if (nonsimple_loop)
+		{
+	          loop_count = count_init_val;
+		  desc->const_iter = true;
+		}
+              gcc_assert (desc->const_iter && loop_count == count_init_val);
+	    }
+	  if (dump_file)
+	    {
+	      fprintf (dump_file, "SMS const-%s ",
+		       doloop_p ? "doloop" : "loop");
+	      fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC " to "
+		       HOST_WIDEST_INT_PRINT_DEC " step "
+		       HOST_WIDEST_INT_PRINT_DEC,
+		       count_init_val, count_fin_val, count_step);
+	      fprintf (dump_file, "\n");
+	    }
         }
 
       node_order = XNEWVEC (int, g->num_nodes);
@@ -1450,8 +1938,8 @@  sms_schedule (void)
       /* The default value of PARAM_SMS_MIN_SC is 2 as stage count of
 	 1 means that there is no interleaving between iterations thus
 	 we let the scheduling passes do the job in this case.  */
-      if (stage_count < (unsigned) PARAM_VALUE (PARAM_SMS_MIN_SC)
-	  || (count_init && (loop_count <= stage_count))
+      if (stage_count < PARAM_VALUE (PARAM_SMS_MIN_SC)
+	  || (desc->const_iter && (loop_count <= stage_count))
 	  || (flag_branch_probabilities && (trip_count <= stage_count)))
 	{
 	  if (dump_file)
@@ -1467,8 +1955,10 @@  sms_schedule (void)
       else
 	{
 	  struct undo_replace_buff_elem *reg_move_replaces;
+	  int row, cmp_stage = -1;
+	  ps_insn_ptr crr_insn;
 
-          if (!opt_sc_p)
+	  if (!opt_sc_p)
             {
 	      /* Rotate the partial schedule to have the branch in row ii-1.  */
               int amount = SCHED_TIME (g->closing_branch) - (ps->ii - 1);
@@ -1489,23 +1979,24 @@  sms_schedule (void)
 	      print_partial_schedule (ps, dump_file);
 	    }
  
-          /* case the BCT count is not known , Do loop-versioning */
-	  if (count_reg && ! count_init)
-            {
-	      rtx comp_rtx = gen_rtx_fmt_ee (GT, VOIDmode, count_reg,
-	  				     GEN_INT(stage_count));
-	      unsigned prob = (PROB_SMS_ENOUGH_ITERATIONS
-			       * REG_BR_PROB_BASE) / 100;
-
-	      loop_version (loop, comp_rtx, &condition_bb,
-	  		    prob, prob, REG_BR_PROB_BASE - prob,
-			    true);
-	     }
+	  if (!desc->const_iter)
+	    {
+	      /* Loop versioning if the number of iterations is unknown.  */
+	      unsigned prob;
+	      rtx vers_cond;
+	      vers_cond = gen_rtx_fmt_ee (GT, VOIDmode, nonsimple_loop ?
+					  count_reg : desc->niter_expr,
+					  GEN_INT (stage_count));
+	      if (dump_file)
+		{
+		  fprintf (dump_file, "\nLoop versioning condition:\n");
+		  print_rtl_single (dump_file, vers_cond);
+		}
 
-	  /* Set new iteration count of loop kernel.  */
-          if (count_reg && count_init)
-	    SET_SRC (single_set (count_init)) = GEN_INT (loop_count
-						     - stage_count + 1);
+	      prob = (PROB_SMS_ENOUGH_ITERATIONS * REG_BR_PROB_BASE) / 100;
+	      loop_version (loop, vers_cond, &condition_bb, prob,
+			    prob, REG_BR_PROB_BASE - prob, true);
+	    }
 
 	  /* Now apply the scheduled kernel to the RTL of the loop.  */
 	  permute_partial_schedule (ps, g->closing_branch->first_note);
@@ -1520,8 +2011,116 @@  sms_schedule (void)
 	  reg_move_replaces = generate_reg_moves (ps, true);
 	  if (dump_file)
 	    print_node_sched_params (dump_file, g->num_nodes, g);
-	  /* Generate prolog and epilog.  */
-          generate_prolog_epilog (ps, loop, count_reg, count_init);
+
+	  if (doloop_p && count_init_isconst)
+	    {
+	      /* Change counter reg initialization constant. In more complex
+	         cases this adjustment is done with adding some insns
+		 to loop prologue in generate_prolog_epilog function.  */
+	      gcc_assert (single_set (count_init_insn) != NULL_RTX);
+	      SET_SRC (single_set (count_init_insn))
+		    = GEN_INT (count_init_val - stage_count + 1);
+	      df_insn_rescan (count_init_insn);
+	    }
+
+	  if (!doloop_p)
+	    {
+	      /* Calculation of the compare insn stage in schedule.  */
+	      for (row = 0; row < ps->ii; row++)
+		for (crr_insn = ps->rows[row];
+		     crr_insn;
+		     crr_insn = crr_insn->next_in_row)
+		  {
+		    gcc_assert (0 <= SCHED_STAGE (crr_insn->node));
+		    gcc_assert (SCHED_STAGE (crr_insn->node) < stage_count);
+		    if (rtx_equal_p (crr_insn->node->insn, cmp))
+		      {
+			gcc_assert (cmp_stage == -1);
+		        cmp_stage = SCHED_STAGE (crr_insn->node);
+		      }
+		  }
+              if (dump_file)
+		fprintf (dump_file, "cmp_stage=%d\n", cmp_stage);
+	      gcc_assert (cmp_stage >= 0);
+	    }
+
+	  /* When compare insn stage is non-zero we are to shift the final
+	     counter reg value (which counter is compared to exit loop).
+	     Final value can be an immediate or can be a register, which
+	     constant initialization we find in preheader.  */
+	  was_immediate = false;
+	  if (!doloop_p && count_fin_isconst && cmp_stage > 0)
+	    {
+              gcc_assert (0 <= cmp_side && cmp_side <= 1);
+	      /* New finish value.  */
+	      count_fin_newval = count_fin_val - count_step * cmp_stage;
+	      was_immediate = CONST_INT_P (XEXP (SET_SRC (single_set (cmp)),
+							  1 - cmp_side));
+	      if (was_immediate)
+		{
+		  /* Check whether new value also can be an immediate.
+		     For exapmle, on ARM not all values can be encoded as
+		     an immediate, so we have to load it to a register once
+		     before the loop starts.  */
+		  rtx to = GEN_INT (count_fin_newval);
+		  prolog_create_reg = rtx_cost (to, GET_CODE (to), 0, false)
+				    > rtx_cost (GEN_INT(1), CONST_INT, 0, false);
+	        }
+	      else
+		{
+		  /* A value is already in a register and we easily change
+		     initialization instruction in preheader.  */
+		  gcc_assert (count_fin_init_insn);
+		  SET_SRC (single_set (count_fin_init_insn))
+			= GEN_INT (count_fin_newval);
+		  df_insn_rescan (count_fin_init_insn);
+		}
+	    }
+
+	  /* The adjustment of finish register value.
+	     Zero means no adjustment needed or adjusment is done
+	     without additional insn in prologue.  */
+	  if (!doloop_p && !count_fin_isconst)
+	    prolog_fin_nonconst_adjust = count_step * cmp_stage;
+
+	  /* Ready to generate prolog and epilog.  */
+	  generate_prolog_epilog (ps, loop, count_reg, doloop_p,
+			          count_init_isconst, count_fin_reg,
+				  prolog_fin_nonconst_adjust,
+				  prolog_create_reg, count_fin_newval,
+				  &new_comp_reg);
+
+	  /* And only after generating prolog and epilog it is possible
+	     to modify the compare instruction (to prevent copying wrong insn
+	     form to first and last stages).  */
+	  if (!doloop_p && cmp_stage > 0)
+	    {
+              gcc_assert (0 <= cmp_side && cmp_side <= 1);
+	      if (was_immediate && !prolog_create_reg)
+		{
+		/* Easy case - just modify a constant.  */
+		  gcc_assert (new_comp_reg == NULL_RTX);
+		  XEXP (SET_SRC (single_set (cmp)), 1 - cmp_side)
+			= GEN_INT (count_fin_newval);
+		}
+	      else
+		{
+		  if (count_fin_isconst && !was_immediate)
+		    /* Value is in a register and we already changed
+		       initialization instruction in preheader.  */
+		    gcc_assert (new_comp_reg == NULL_RTX);
+		  else
+		    {
+		      /* Another case - use created by generate_prolog_epilog
+		         register, which value is initialized in prologue.  */
+		      gcc_assert (new_comp_reg != NULL_RTX);
+		      XEXP (SET_SRC (single_set (cmp)), 1 - cmp_side)
+			      = new_comp_reg;
+		    }
+		}
+	      df_insn_rescan (cmp);
+	    }
+	  else gcc_assert (new_comp_reg == NULL_RTX);
 
 	  free_undo_replace_buff (reg_move_replaces);
 	}
@@ -1532,7 +2131,9 @@  sms_schedule (void)
       free_ddg (g);
     }
 
+  df_set_flags (DF_LR_RUN_DCE);
   free (g_arr);
+  iv_analysis_done ();
 
   /* Release scheduler data, needed until now because of DFA.  */
   haifa_sched_finish ();
diff --git a/gcc/sched-int.h b/gcc/sched-int.h
index 6797397..4a1d726 100644
--- a/gcc/sched-int.h
+++ b/gcc/sched-int.h
@@ -1232,6 +1232,8 @@  extern void sched_deps_finish (void);
 extern void haifa_note_reg_set (int);
 extern void haifa_note_reg_clobber (int);
 extern void haifa_note_reg_use (int);
+void sms_sched_analyze_init (void);
+regset extract_from_insn_map (rtx, bool);
 
 extern void maybe_extend_reg_info_p (void);