diff mbox

[SMS] Support new loop pattern

Message ID CAJnFk2dZw4duMwcC2jo-2onpecH=FTeXJ1-fdqSAJkMmDk7Oeg@mail.gmail.com
State New
Headers show

Commit Message

Roman Zhuykov Dec. 7, 2011, 2:36 p.m. UTC
2011/10/12 Ayal Zaks <ayal.zaks@gmail.com>:>>> - the last jump
instruction should look like:  pc=(regF!=0)?label:pc, regF is>> you'd
probably want to bump to next instruction if falling through,> e.g.,
pc=(regF!=0)?label:pc+4>
It is considered that program counter is increased automatically
onhardware level.Otherwise we should add something like "pc=pc+4" in
parallel to eachinstruction in RTL.
>>>  flag register;>>> - the last instruction which sets regF should be: regF=COMPARE(regC,X), where X>>>  is a constant, or maybe a register, which is not changed inside a loop;>>> - only one instruction modifies regC inside a loop (other can use regC, but not>>>  write), and it should simply adjust it by a constant: regC=regC+step, where>>>  step is a constant.>>>>> When doloop is succesfully scheduled by SMS, its number of>>> iterations of loop kernel should be decreased by the number of stages in a>>> schedule minus one, while other iterations expand to prologue and epilogue.>>> In new supported loops such approach can't be used, because some>>> instructions can use count register (regC).  Instead of this,>>> the final register value X in compare instruction regF=COMPARE(regC,X)>>> is changed to another value Y respective to the stage this instruction>>> is scheduled (Y = X - stage * step).>> making sure this does not underflow; i.e., that the number of> iterations is no less than stage (you've addressed this towards the> end below).>
Yes, this situation is processed correctly.
>>>> The main difference from doloop case is that regC can be used by some>> instructions in loop body.>> That's why we are unable to simply adjust regC initial value, but have>> to keep it's value correct on each particular iteration.>> So, we change comparison instruction accordingly.>>>> An example:>> int a[100];>> int main()>> {>>  int i;>>  for (i = 85; i > 12; i -= 5)>>      a[i] = i * i;>>  return a[15]-225;>> }>> ARM assembler with "-O2 -fno-auto-inc-dec":>>        ldr     r0, .L5>>        mov     r3, #85>>        mov     r2, r0>> .L2:>>        mul     r1, r3, r3>>        sub     r3, r3, #5>>        cmp     r3, #10>>        str     r1, [r2, #340]>>        sub     r2, r2, #20>>        bne     .L2>>        ldr     r0, [r0, #60]>>        sub     r0, r0, #225>>        bx      lr>> .L5:>>        .word   a>>>> Loop body is executed 15 times.>> When compiling with SMS, it finds a schedule with ii=7, stage_count=3>> and following times:>> Stage  Time       Insn>> 0          5      mul     r1, r3, r3>> 1         10     sub     r3, r3, #5>> 1         11     cmp     r3, #10>> 1         11     str     r1, [r2, #340]>> 1         13     bne     .L2>> 2         16     sub     r2, r2, #20>>>> branch is not scheduled last?>
Yes, branch schedule time is smaller then sub's one.This mean that
"sub r2, r2, $20" instruction from original iterationnumber K will be
executed after"bne .L2" from original iteration number K.But certainly
bne remains to be the last instuction in new loop body.Below you can
see how it looks after SMS.
>> To make new schedule correct the loop body>> should be executed 14 times and we change compare instruction:>> the loop itself should execute 13 times.
with i =85, 80, 75, 70, 6560, 55, 50, 45, 4035, 30, 25, 20, 15this
gives total 15 iterations (15 stores to memory).And new loop body will
be executed 13 times (one store goes toepilogue and one - to
prologue).
>> regF=COMPARE(regC,X) to regF=COMPARE(regC,Y) where Y = X - stage * step.>> In our example regC is r3, X is 10, step = -5, compare instruction>> is scheduled on stage 1, so it should be Y = 10 - 1 * (-5) = 15.>>>> right. In general, if the compare is on stage s (starting from 0), it> will be executed s times in the epilog, so it should exit the loop> upon reaching Y = X - s * step.>>> So, after SMS it looks like:>>        ldr     r0, .L5>>        mov     r3, #85>>        mov     r2, r0>> ;;prologue>>        mul     r1, r3, r3      ;;from stage 0 first iteration>>        sub     r3, r3, #5      ;;3 insns from stage 1 first iteration>>        cmp     r3, #10>>        str     r1, [r2, #340]>>        mul     r1, r3, r3      ;;from stage 0 second iteration>> ;;body>> .L2:>>        sub     r3, r3, #5>>        sub     r2, r2, #20>>        cmp     r3, #15         ;; new value to compare with is Y=15>>        str     r1, [r2, #340]>>        mul     r1, r3, r3>>        bne     .L2>> ;;epilogue>>        sub     r2, r2, #20     ;;from stage 2 pre-last iteration>>        sub     r3, r3, #5      ;;3 insns from stage 1 last iteration>>        cmp     r3, #10>>        str     r1, [r2, #340]>>        sub     r2, r2, #20     ;;from stage 2 last iteration>>>>        ldr     r0, [r0, #60]>>        sub     r0, r0, #225>>        bx      lr>> .L5:>>        .word   a>>
Here in comments I mention why insn was copied to prolog and
epilog.Only branch is not copied at all.
>>> Testing of this appoach reveals two bugs, which do not appear while SMS was>>> used only for doloop loops.  Both these bugs happen due to the nature of the>>> flag register.  On x86_64 it is clobbered by most of arithmetic instructions.> This should ideally be solved by a dedicated (separate) patch.> ...> This too should be solved by a dedicated (separate) patch, for easier digestion.
As Ayal asks, I'll continue discussion of these two bugs in
twoseparate e-mails, answering on this letter.
>>>>>> One more thing to point out is number of loop iterations. When number of>>> iterations of a loop is not known at compile time, SMS has to create two loop>>> versions (original and scheduled), and execute scheduled one only when real>>> number of iterations is bigger than number of stages.  In doloop case the>>> number of iterations simply equals to the count register value before the loop.>>> So SMS finds its constant initialization or makes two loop versions.  In new>>> supported loops number of iterations value is more complex.  It even can't be>>> calculated as (final_reg_value-start_reg_value)/step because of examples like>>> this:>>>>>> for (unsigned int x = 0x0; x != 0x6F80919A; x += 0xEDCBA987) ...;>>>>>> This loop has 22 iterations.  So, i decided to use get_simple_loop_desc>>> function which gives a structure with loop characteristics, some of them helps>>> to find iteration number:>>>>>> rtx niter_expr - The number of iterations of the loop;>>> bool const_iter - True if the loop iterates the constant number of times;>>> unsigned HOST_WIDEST_INT niter - Number of iterations if constant;>>>>>> But we can use these expressions only after looking through some other fields>>> of returned structure:>>>>>> bool simple_p - True if we are able to say anything about number of iterations>>> of the loop;>>> rtx assumptions - Assumptions under that the rest of the information is valid;>>> rtx noloop_assumptions - Assumptions under which the loop ends before reaching>>> the latch;>>> rtx infinite - Condition under which the loop is infinite.>>>>>> I decide to allow SMS scheduling only when simple_p is true and other three>>> fields are NULL_RTX, or when simple_p is true and>>> flag_unsafe_loop_optimizations is set.  One more exception is infinite>>> condition, and the next separate patch is an attempt to process it.>>>>> ok, still need to go over this rather lengthy and orthogonal (although> it exposes the bugs above) piece.>> Ayal.>>
New version is attached, it suits current trunk.Without fixing both
bugs mentioned above, this patch brokes bootstrap on x86-64.
Together with DDG fixes the patch was succesfully regtested on ARM,and
"regstrapped" on x86-64 and IA64.
--Roman Zhuykovzhroma@ispras.ru
diff mbox

Patch

2011-12-07  Roman Zhuykov  <zhroma@ispras.ru>
	* modulo-sched.c (nondoloop_register_get): New function.
	(const_iteration_count): Rename to ...
	(search_const_init): ...this.  Add new parameter (is_const).  Always
	return register initialization rtx and set is_const to true
	only when it is constant.
	(duplicate_insns_of_cycles): Add new parameter (doloop_p).  Do not
	duplicate instructions with count_reg only when doloop_p is set.
	Update all callers.
	(generate_prolog_epilog): Add new parameters.  Correctly generate loop
	prologue for new loop pattern.
	(sms_schedule): Support new loop pattern.
---

diff --git a/gcc/modulo-sched.c b/gcc/modulo-sched.c
index 0ea9a4d..e62aca7 100644
--- a/gcc/modulo-sched.c
+++ b/gcc/modulo-sched.c
@@ -220,7 +220,8 @@  static void set_node_sched_params (ddg_ptr);
 static partial_schedule_ptr sms_schedule_by_order (ddg_ptr, int, int, int *);
 static void permute_partial_schedule (partial_schedule_ptr, rtx);
 static void generate_prolog_epilog (partial_schedule_ptr, struct loop *,
-                                    rtx, rtx);
+                                    rtx, bool, bool, rtx, HOST_WIDEST_INT,
+                                    bool, HOST_WIDEST_INT, rtx *);
 static int calculate_stage_count (partial_schedule_ptr, int);
 static void calculate_must_precede_follow (ddg_node_ptr, int, int,
 					   int, int, sbitmap, sbitmap, sbitmap);
@@ -255,7 +256,7 @@  typedef struct node_sched_params node_sched_params;
 DEF_VEC_O (node_sched_params);
 DEF_VEC_ALLOC_O (node_sched_params, heap);
 
-/* The following three functions are copied from the current scheduler
+/* The following two functions are copied from the current scheduler
    code in order to use sched_analyze() for computing the dependencies.
    They are used when initializing the sched_info structure.  */
 static const char *
@@ -398,37 +399,164 @@  doloop_register_get (rtx head ATTRIBUTE_UNUSED, rtx tail ATTRIBUTE_UNUSED)
 #endif
 }
 
-/* Check if COUNT_REG is set to a constant in the PRE_HEADER block, so
-   that the number of iterations is a compile-time constant.  If so,
-   return the rtx that sets COUNT_REG to a constant, and set COUNT to
-   this constant.  Otherwise return 0.  */
+/* Same as previous for loop with always-the-same-step counter.  */
 static rtx
-const_iteration_count (rtx count_reg, basic_block pre_header,
-		       HOST_WIDEST_INT * count)
+nondoloop_register_get (rtx head, rtx tail, int cmp_side,
+			rtx *addsub_output, rtx *cmp_output)
+{
+  rtx insn, reg, flagreg, addsub, cmp, end;
+
+  /* Check jump instruction form */
+  insn = single_set (tail);
+  if (insn == NULL_RTX
+      || SET_DEST (insn) != pc_rtx
+      || GET_CODE (SET_SRC (insn)) != IF_THEN_ELSE
+      || GET_CODE (XEXP (SET_SRC (insn), 1)) != LABEL_REF
+      || XEXP (SET_SRC (insn), 2) != pc_rtx)
+    return NULL_RTX;
+
+  /* Check loop exit condition */
+  insn = XEXP (SET_SRC (insn), 0);
+  if (GET_CODE (insn) != NE || XEXP (insn, 1) != const0_rtx)
+    return NULL_RTX;
+
+  /* Flags register */
+  flagreg = XEXP (insn, 0);
+
+  /* Searching comparison instruction */
+  cmp = PREV_INSN (tail);
+  while (cmp != PREV_INSN (head))
+    {
+      if (INSN_P (cmp) && reg_set_p (flagreg, cmp))
+        break;
+      cmp = PREV_INSN (cmp);
+    }
+  if (cmp == PREV_INSN (head))
+    return NULL_RTX;
+
+  /* Check comparison */
+  insn = single_set (cmp);
+  if (insn == NULL_RTX
+      || ! rtx_equal_p (flagreg, SET_DEST (insn))
+      || GET_CODE (SET_SRC (insn)) != COMPARE)
+    return NULL_RTX;
+
+  /* Loop register */
+  gcc_assert (0 <= cmp_side && cmp_side <= 1);
+  reg = XEXP (SET_SRC (insn), cmp_side);
+  if (! REG_P (reg))
+    return NULL_RTX;
+
+  /* End value */
+  end = XEXP (SET_SRC (insn), 1 - cmp_side);
+  if (! REG_P (end) && ! CONST_INT_P (end))
+    return NULL_RTX;
+
+  /* Searching register add\sub instruction */
+  addsub = PREV_INSN (cmp);
+  while (addsub != PREV_INSN (head))
+    {
+      if (INSN_P (addsub) && reg_set_p (reg, addsub))
+        break;
+      addsub = PREV_INSN (addsub);
+    }
+  if (addsub == PREV_INSN (head))
+    return NULL_RTX;
+
+  /* Checking register change instruction */
+  insn = single_set (addsub);
+  if (insn == NULL_RTX || ! rtx_equal_p (reg, SET_DEST (insn)))
+    return NULL_RTX;
+  insn = SET_SRC (insn);
+  if ((GET_CODE (insn) != PLUS && GET_CODE (insn) != MINUS)
+      || ! rtx_equal_p (reg, XEXP (insn, 0))
+      || ! (CONST_INT_P (XEXP (insn, 1))))
+    return NULL_RTX;
+
+  /* No other REG and END (if reg) modifications allowed */
+  for (insn = head; insn != tail; insn = NEXT_INSN (insn))
+    {
+      if (REG_P(end) && reg_set_p (end, insn))
+        {
+          if (dump_file)
+          {
+            fprintf (dump_file, "SMS end register found ");
+            print_rtl_single (dump_file, reg);
+            fprintf (dump_file, " outside write in insn:\n");
+            print_rtl_single (dump_file, insn);
+          }
+	  return NULL_RTX;
+	}
+      if (insn != addsub && reg_set_p (reg, insn))
+        {
+          if (dump_file)
+          {
+            fprintf (dump_file, "SMS count_reg found ");
+            print_rtl_single (dump_file, reg);
+            fprintf (dump_file, " outside write in insn:\n");
+            print_rtl_single (dump_file, insn);
+          }
+          return NULL_RTX;
+        }
+    }
+
+  *addsub_output = addsub;
+  *cmp_output = cmp;
+  return reg;
+}
+
+/* Check if REG is set to a constant in the PRE_HEADER block.
+   If possible to find, return the rtx that sets REG.
+   If REG is set to a constant (probably not directly),
+   set IS_CONST to true and VALUE to that constant value.  */
+static rtx
+search_const_init (basic_block pre_header, rtx reg, bool *is_const,
+		   HOST_WIDEST_INT *value)
 {
   rtx insn;
   rtx head, tail;
 
-  if (! pre_header)
-    return NULL_RTX;
+  if (!pre_header)
+    {
+      *is_const = false;
+      return NULL_RTX;
+    }
 
   get_ebb_head_tail (pre_header, pre_header, &head, &tail);
 
   for (insn = tail; insn != PREV_INSN (head); insn = PREV_INSN (insn))
     if (NONDEBUG_INSN_P (insn) && single_set (insn) &&
-	rtx_equal_p (count_reg, SET_DEST (single_set (insn))))
+	rtx_equal_p (reg, SET_DEST (single_set (insn))))
       {
-	rtx pat = single_set (insn);
+	rtx src, pat = single_set (insn);
+	src = SET_SRC (pat);
 
-	if (CONST_INT_P (SET_SRC (pat)))
+	if (CONST_INT_P (src))
 	  {
-	    *count = INTVAL (SET_SRC (pat));
-	    return insn;
+	    *is_const = true;
+	    *value = INTVAL (src);
+	  }
+	else if (REG_P (src))
+	  { /* Check if previous insn sets SRC = constant.  */
+	    pat = single_set (PREV_INSN (insn));
+	    if (pat != NULL_RTX && rtx_equal_p (src, SET_DEST (pat))
+		&& CONST_INT_P (SET_SRC (pat)))
+	      {
+		*is_const = true;
+		*value = INTVAL (SET_SRC (pat));
+	      }
+	    else
+		*is_const = false;
 	  }
+	else
+	  *is_const = false;
 
-	return NULL_RTX;
+	return insn;
       }
+    else if (reg_set_p (reg, insn))
+      break;
 
+  *is_const = false;
   return NULL_RTX;
 }
 
@@ -1103,7 +1231,7 @@  clear:
 
 static void
 duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage,
-			   int to_stage, rtx count_reg)
+			   int to_stage, rtx count_reg, bool doloop_p)
 {
   int row;
   ps_insn_ptr ps_ij;
@@ -1115,14 +1243,14 @@  duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage,
 	int first_u, last_u;
 	rtx u_insn;
 
-        /* Do not duplicate any insn which refers to count_reg as it
-           belongs to the control part.
+        /* In doloop case do not duplicate any insn which refers
+	   to count_reg as it belongs to the control part.
            The closing branch is scheduled as well and thus should
            be ignored.
            TODO: This should be done by analyzing the control part of
            the loop.  */
 	u_insn = ps_rtl_insn (ps, u);
-        if (reg_mentioned_p (count_reg, u_insn)
+        if ((doloop_p && reg_mentioned_p (count_reg, u_insn))
             || JUMP_P (u_insn))
           continue;
 
@@ -1142,7 +1270,10 @@  duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage,
 /* Generate the instructions (including reg_moves) for prolog & epilog.  */
 static void
 generate_prolog_epilog (partial_schedule_ptr ps, struct loop *loop,
-                        rtx count_reg, rtx count_init)
+                        rtx count_reg, bool doloop_p, bool count_init_isconst,
+			rtx fin_reg, HOST_WIDEST_INT fin_nonconst_adjust,
+			bool create_reg, HOST_WIDEST_INT reg_val,
+			rtx *created_reg)
 {
   int i;
   int last_stage = PS_STAGE_COUNT (ps) - 1;
@@ -1151,12 +1282,12 @@  generate_prolog_epilog (partial_schedule_ptr ps, struct loop *loop,
   /* Generate the prolog, inserting its insns on the loop-entry edge.  */
   start_sequence ();
 
-  if (!count_init)
+  if (doloop_p && !count_init_isconst)
     {
-      /* Generate instructions at the beginning of the prolog to
-         adjust the loop count by STAGE_COUNT.  If loop count is constant
-         (count_init), this constant is adjusted by STAGE_COUNT in
-         generate_prolog_epilog function.  */
+      /* In doloop we generate instructions at the beginning of the prolog to
+         adjust the initial value of doloop counter by STAGE_COUNT.
+	 If loop count is constant, this adjustment is done outside this
+         function, simply correcting the source of initialization insn.  */
       rtx sub_reg = NULL_RTX;
 
       sub_reg = expand_simple_binop (GET_MODE (count_reg), MINUS,
@@ -1167,8 +1298,40 @@  generate_prolog_epilog (partial_schedule_ptr ps, struct loop *loop,
         emit_move_insn (count_reg, sub_reg);
     }
 
+  if (!doloop_p)
+    {
+      /* In non-doloop we generate instructions at the beginning of
+         the prolog to adjust the final value (with this value loop count
+	 register is compared to check whether the loop should stop).  */
+      if (fin_nonconst_adjust != 0)
+	{
+	  /* If the final value is in a register - create another register
+	     to store a shifted value.  */
+	  rtx new_reg, reg = NULL_RTX;
+	  reg = gen_reg_rtx (GET_MODE (fin_reg));
+	  new_reg = expand_simple_binop (GET_MODE (fin_reg), MINUS, fin_reg,
+					 GEN_INT (fin_nonconst_adjust),
+					 reg, 0, OPTAB_DIRECT);
+	  gcc_assert (REG_P (new_reg));
+	  if (REGNO (new_reg) != REGNO (reg))
+	    emit_move_insn (reg, new_reg);
+	  *created_reg = new_reg;
+	}
+      else if (create_reg)
+	{
+	  /* If old final value is an immediate, and the new one can't be
+	     an immediate, we create a register to store it.  If both values
+	     are immediate the adjustment is done outside this fuction,
+	     just correcting the constant value in compare intruction.  */
+	  rtx reg = NULL_RTX;
+	  reg = gen_reg_rtx (GET_MODE (count_reg));
+	  emit_move_insn (reg, GEN_INT (reg_val));
+	  *created_reg = reg;
+	}
+    }
+
   for (i = 0; i < last_stage; i++)
-    duplicate_insns_of_cycles (ps, 0, i, count_reg);
+    duplicate_insns_of_cycles (ps, 0, i, count_reg, doloop_p);
 
   /* Put the prolog on the entry edge.  */
   e = loop_preheader_edge (loop);
@@ -1182,7 +1345,7 @@  generate_prolog_epilog (partial_schedule_ptr ps, struct loop *loop,
   start_sequence ();
 
   for (i = 0; i < last_stage; i++)
-    duplicate_insns_of_cycles (ps, i + 1, last_stage, count_reg);
+    duplicate_insns_of_cycles (ps, i + 1, last_stage, count_reg, doloop_p);
 
   /* Put the epilogue on the exit edge.  */
   gcc_assert (single_exit (loop));
@@ -1460,13 +1623,30 @@  sms_schedule (void)
           continue;
         }
 
-      /* Make sure this is a doloop.  */
-      if ( !(count_reg = doloop_register_get (head, tail)))
-      {
-        if (dump_file)
-          fprintf (dump_file, "SMS doloop_register_get failed\n");
-	continue;
-      }
+      /* Is this a doloop?  */
+      if ((count_reg = doloop_register_get (head, tail)))
+        {
+	  if (dump_file)
+	    fprintf (dump_file, "SMS doloop\n");
+        }
+      else if ((count_reg = nondoloop_register_get (head, tail, 0,
+						    &insn, &insn)))
+	{
+	  if (dump_file)
+	    fprintf (dump_file, "SMS non-doloop\n");
+	}
+      else if ((count_reg = nondoloop_register_get (head, tail, 1,
+						    &insn, &insn)))
+	{
+	  if (dump_file)
+	    fprintf (dump_file, "SMS non-doloop with transposed cmp\n");
+	}
+      else
+	{
+	  if (dump_file)
+	    fprintf (dump_file, "SMS imcompatible loop\n");
+	  continue;
+	}
 
       /* Don't handle BBs with calls or barriers
 	 or !single_set with the exception of instructions that include
@@ -1516,7 +1696,6 @@  sms_schedule (void)
 	    fprintf (dump_file, "SMS create_ddg failed\n");
 	  continue;
         }
-
       g_arr[loop->num] = g;
       if (dump_file)
         fprintf (dump_file, "...OK\n");
@@ -1528,14 +1707,28 @@  sms_schedule (void)
     fprintf (dump_file, "=========================\n\n");
   }
 
+  df_clear_flags (DF_LR_RUN_DCE);
+
   /* We don't want to perform SMS on new loops - created by versioning.  */
   FOR_EACH_LOOP (li, loop, 0)
     {
+      bool doloop_p, count_fin_isconst, count_init_isconst;
+      bool was_immediate = false;
+      bool prolog_create_reg = false;
+      int prolog_fin_nonconst_adjust = 0;
+      bool nonsimple_loop = false;
       rtx head, tail;
-      rtx count_reg, count_init;
-      int mii, rec_mii, stage_count, min_cycle;
-      HOST_WIDEST_INT loop_count = 0;
+      int min_cycle;
       bool opt_sc_p;
+      rtx count_reg, count_fin_reg, new_comp_reg = NULL_RTX;
+      rtx count_init_insn, count_fin_init_insn;
+      rtx add, cmp;
+      int mii, rec_mii, cmp_side = -1, cmp_stage = -1;
+      int stage_count = 0;
+      HOST_WIDEST_INT count_init_val = 0, count_fin_val = 0;
+      HOST_WIDEST_INT count_step = 0, loop_count = -1;
+      HOST_WIDEST_INT count_fin_newval = 0;
+      struct niter_desc *desc = NULL;
 
       if (! (g = g_arr[loop->num]))
         continue;
@@ -1573,32 +1766,159 @@  sms_schedule (void)
 	               (HOST_WIDEST_INT) profile_info->sum_max);
 	      fprintf (dump_file, "\n");
 	    }
-	  fprintf (dump_file, "SMS doloop\n");
 	  fprintf (dump_file, "SMS built-ddg %d\n", g->num_nodes);
           fprintf (dump_file, "SMS num-loads %d\n", g->num_loads);
           fprintf (dump_file, "SMS num-stores %d\n", g->num_stores);
 	}
 
 
-      /* In case of th loop have doloop register it gets special
-	 handling.  */
-      count_init = NULL_RTX;
-      if ((count_reg = doloop_register_get (head, tail)))
+      /* Extract count register and determine loop type.  */
+      add = NULL_RTX;
+      cmp = NULL_RTX;
+      if ((count_reg = doloop_register_get (head, tail))
+	  || (count_reg = nondoloop_register_get (head, tail, 0, &add, &cmp))
+	  || (count_reg = nondoloop_register_get (head, tail, 1, &add, &cmp)))
 	{
-	  basic_block pre_header;
+	  basic_block pre_header = loop_preheader_edge (loop)->src;
+
+	  doloop_p = (cmp == NULL_RTX);
+	  if (doloop_p)
+	    {
+	      /* Doloop finish parameters are always the same.  */
+	      count_step = -1;
+	      count_fin_isconst = true;
+	      count_fin_val = 0;
+	      count_fin_reg = NULL_RTX;
+	      count_fin_init_insn = NULL_RTX;
+	    }
+	  else
+	    {
+	      /* In other loop we need to determine counter step
+	         and finish parameters.  */
+	      rtx step, end;
+
+	      gcc_assert (single_set (add) && single_set (cmp));
+
+	      /* Extract the step.  */
+	      step = XEXP (SET_SRC (single_set (add)), 1);
+	      gcc_assert (CONST_INT_P (step));
+
+	      if (GET_CODE (SET_SRC (single_set (add))) == MINUS)
+	        count_step = - INTVAL (step);
+	      else if (GET_CODE (SET_SRC (single_set (add))) == PLUS)
+	        count_step = INTVAL (step);
+	      else
+		gcc_unreachable ();
+
+	      gcc_assert(count_step != 0);
+
+	      /* Check what operand of compare insn is a counter register.  */
+	      if (count_reg == XEXP (SET_SRC (single_set (cmp)), 0))
+		cmp_side = 0;
+	      else if (count_reg == XEXP (SET_SRC (single_set (cmp)), 1))
+		cmp_side = 1;
+	      else
+		gcc_unreachable ();
+
+	      /* Extract finish border for counter reg.  */
+	      end = XEXP (SET_SRC (single_set (cmp)), 1 - cmp_side);
 
-	  pre_header = loop_preheader_edge (loop)->src;
-	  count_init = const_iteration_count (count_reg, pre_header,
-					      &loop_count);
+	      if (CONST_INT_P (end))
+		{
+		  /* Constant finish border.  loop until (reg != const).  */
+		  count_fin_isconst = true;
+		  count_fin_val = INTVAL (end);
+		  count_fin_reg = NULL_RTX;
+		  count_fin_init_insn = NULL_RTX;
+		}
+	      else if (REG_P (end))
+		{
+		  /* Register is a border.  Loop until (reg != fin_reg).  */
+		  count_fin_reg = end;
+		  count_fin_isconst = false;
+		  /* Try to find constant initinalization of fin_reg
+		   * in preheader.  */
+		  count_fin_init_insn = search_const_init (pre_header,
+							   count_fin_reg,
+							   &count_fin_isconst,
+							   &count_fin_val);
+		}
+	      else
+		gcc_unreachable ();
+	    }
+	  /* Try to find a constant initalization of count_reg in preheader.  */
+	  count_init_insn = search_const_init (pre_header,
+					       count_reg,
+					       &count_init_isconst,
+					       &count_init_val);
+	}
+      else /* Loop is incompatible now, but it was OK on while analyzing!  */
+	gcc_assert (count_reg);
+
+
+      desc = get_simple_loop_desc (loop);
+      gcc_assert (desc);
+      /* nonsimple_loop means it's impossible to analyze the loop
+         or there are some assumptions to make the analyzis results right
+         or there is a condition of non-infinite number of iterations.
+        We want doloops to be scheduled even if analyzis shows they are
+	 nonsimple (backward compatibility).  */
+      nonsimple_loop = !desc->simple_p;
+      /* We allow scheduling loop with some assumptions or infinite condition
+	 only when unsafe_loop_optimizations flag is enabled.  */
+      if (flag_unsafe_loop_optimizations)
+	 {
+	   desc->infinite = NULL_RTX;
+	   desc->assumptions = NULL_RTX;
+	   desc->noloop_assumptions = NULL_RTX;
+	 }
+      nonsimple_loop = nonsimple_loop || (desc->assumptions != NULL_RTX)
+			|| (desc->noloop_assumptions != NULL_RTX)
+			|| (desc->infinite != NULL_RTX);
+      /* Only doloops can be nonsimple_loops for SMS.  */
+      if (nonsimple_loop && !doloop_p)
+	{
+	  free_ddg (g);
+	  continue;
+	}
+      /* Manually set some description fields in non-simple doloop.  */
+      if (nonsimple_loop)
+	{
+	  gcc_assert(doloop_p);
+	  desc->const_iter = false;
+	  desc->infinite = NULL_RTX;
 	}
-      gcc_assert (count_reg);
 
-      if (dump_file && count_init)
+      if (desc->const_iter)
+	{
+	  gcc_assert (!desc->infinite);
+	  loop_count = desc->niter;
+	  if (dump_file)
+	    fprintf (dump_file, "SMS const loop iterations = "
+		     HOST_WIDEST_INT_PRINT_DEC "\n", loop_count);
+	}
+      if (count_init_isconst && count_fin_isconst)
         {
-          fprintf (dump_file, "SMS const-doloop ");
-          fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
-		     loop_count);
-          fprintf (dump_file, "\n");
+	  gcc_assert (doloop_p || desc->const_iter);
+	  if (doloop_p)
+	    {
+	      if (nonsimple_loop)
+		{
+	          loop_count = count_init_val;
+		  desc->const_iter = true;
+		}
+              gcc_assert (desc->const_iter && loop_count == count_init_val);
+	    }
+	  if (dump_file)
+	    {
+	      fprintf (dump_file, "SMS const-%s ",
+		       doloop_p ? "doloop" : "loop");
+	      fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC " to "
+		       HOST_WIDEST_INT_PRINT_DEC " step "
+		       HOST_WIDEST_INT_PRINT_DEC,
+		       count_init_val, count_fin_val, count_step);
+	      fprintf (dump_file, "\n");
+	    }
         }
 
       node_order = XNEWVEC (int, g->num_nodes);
@@ -1649,7 +1969,7 @@  sms_schedule (void)
 	     1 means that there is no interleaving between iterations thus
 	     we let the scheduling passes do the job in this case.  */
 	  if (stage_count < PARAM_VALUE (PARAM_SMS_MIN_SC)
-	      || (count_init && (loop_count <= stage_count))
+	      || (desc->const_iter && (loop_count <= stage_count))
 	      || (flag_branch_probabilities && (trip_count <= stage_count)))
 	    {
 	      if (dump_file)
@@ -1709,23 +2029,24 @@  sms_schedule (void)
 	      print_partial_schedule (ps, dump_file);
 	    }
  
-          /* case the BCT count is not known , Do loop-versioning */
-	  if (count_reg && ! count_init)
-            {
-	      rtx comp_rtx = gen_rtx_fmt_ee (GT, VOIDmode, count_reg,
-	  				     GEN_INT(stage_count));
-	      unsigned prob = (PROB_SMS_ENOUGH_ITERATIONS
-			       * REG_BR_PROB_BASE) / 100;
-
-	      loop_version (loop, comp_rtx, &condition_bb,
-	  		    prob, prob, REG_BR_PROB_BASE - prob,
-			    true);
-	     }
+	  if (!desc->const_iter)
+	    {
+	      /* Loop versioning if the number of iterations is unknown.  */
+	      unsigned prob;
+	      rtx vers_cond;
+	      vers_cond = gen_rtx_fmt_ee (GT, VOIDmode, nonsimple_loop ?
+					  count_reg : desc->niter_expr,
+					  GEN_INT (stage_count));
+	      if (dump_file)
+		{
+		  fprintf (dump_file, "\nLoop versioning condition:\n");
+		  print_rtl_single (dump_file, vers_cond);
+		}
 
-	  /* Set new iteration count of loop kernel.  */
-          if (count_reg && count_init)
-	    SET_SRC (single_set (count_init)) = GEN_INT (loop_count
-						     - stage_count + 1);
+	      prob = (PROB_SMS_ENOUGH_ITERATIONS * REG_BR_PROB_BASE) / 100;
+	      loop_version (loop, vers_cond, &condition_bb, prob,
+			    prob, REG_BR_PROB_BASE - prob, true);
+	    }
 
 	  /* Now apply the scheduled kernel to the RTL of the loop.  */
 	  permute_partial_schedule (ps, g->closing_branch->first_note);
@@ -1741,8 +2062,121 @@  sms_schedule (void)
 	  apply_reg_moves (ps);
 	  if (dump_file)
 	    print_node_sched_params (dump_file, g->num_nodes, ps);
-	  /* Generate prolog and epilog.  */
-          generate_prolog_epilog (ps, loop, count_reg, count_init);
+
+	  if (doloop_p && count_init_isconst)
+	    {
+	      /* Change counter reg initialization constant. In more complex
+	         cases this adjustment is done with adding some insns
+		 to loop prologue in generate_prolog_epilog function.  */
+	      gcc_assert (single_set (count_init_insn) != NULL_RTX);
+	      SET_SRC (single_set (count_init_insn))
+		    = GEN_INT (count_init_val - stage_count + 1);
+	      df_insn_rescan (count_init_insn);
+	    }
+
+	  if (!doloop_p)
+	    {
+	      /* Calculation of the compare insn stage in schedule.  */
+	      ps_insn_ptr crr_insn;
+	      int row, stage;
+	      cmp_stage = -1;
+	      for (row = 0; row < ps->ii; row++)
+		for (crr_insn = ps->rows[row];
+		     crr_insn;
+		     crr_insn = crr_insn->next_in_row)
+		  {
+		    stage = SCHED_STAGE (crr_insn->id);
+		    gcc_assert (0 <= stage && stage < stage_count);
+		    if (rtx_equal_p (ps_rtl_insn (ps, crr_insn->id), cmp))
+		      {
+			gcc_assert (cmp_stage == -1);
+		        cmp_stage = stage;
+		      }
+		  }
+              if (dump_file)
+		fprintf (dump_file, "cmp_stage=%d\n", cmp_stage);
+	      gcc_assert (cmp_stage >= 0);
+	    }
+
+	  /* When compare insn stage is non-zero we are to shift the final
+	     counter reg value (which counter is compared to exit loop).
+	     Final value can be an immediate or can be a register, which
+	     constant initialization we find in preheader.  */
+	  was_immediate = false;
+	  if (!doloop_p && count_fin_isconst && cmp_stage > 0)
+	    {
+              gcc_assert (0 <= cmp_side && cmp_side <= 1);
+	      /* New finish value.  */
+	      count_fin_newval = count_fin_val - count_step * cmp_stage;
+	      was_immediate = CONST_INT_P (XEXP (SET_SRC (single_set (cmp)),
+							  1 - cmp_side));
+	      if (was_immediate)
+		{
+		  /* Check whether new value also can be an immediate.
+		     For exapmle, on ARM not all values can be encoded as
+		     an immediate, so we have to load it to a register once
+		     before the loop starts.  */
+		  rtx to = GEN_INT (count_fin_newval);
+		  prolog_create_reg = rtx_cost (to, GET_CODE (to), 0, false)
+			    > rtx_cost (GEN_INT(1), CONST_INT, 0, false);
+	        }
+	      else
+		{
+		  /* A value is already in a register and we easily change
+		     initialization instruction in preheader.  */
+		  gcc_assert (count_fin_init_insn);
+		  SET_SRC (single_set (count_fin_init_insn))
+			= GEN_INT (count_fin_newval);
+		  df_insn_rescan (count_fin_init_insn);
+		}
+	    }
+
+	  /* The adjustment of finish register value.
+	     Zero means no adjustment needed or adjusment is done
+	     without additional insn in prologue.  */
+	  if (!doloop_p && !count_fin_isconst)
+	    prolog_fin_nonconst_adjust = count_step * cmp_stage;
+
+	  /* Ready to generate prolog and epilog.  */
+	  generate_prolog_epilog (ps, loop, count_reg, doloop_p,
+			          count_init_isconst, count_fin_reg,
+				  prolog_fin_nonconst_adjust,
+				  prolog_create_reg, count_fin_newval,
+				  &new_comp_reg);
+
+	  /* And only after generating prolog and epilog it is possible
+	     to modify the compare instruction (to prevent copying wrong insn
+	     form to first and last stages).  */
+	  if (!doloop_p && cmp_stage > 0)
+	    {
+              gcc_assert (0 <= cmp_side && cmp_side <= 1);
+	      if (was_immediate && !prolog_create_reg)
+		{
+		/* Easy case - just modify a constant.  */
+		  gcc_assert (new_comp_reg == NULL_RTX);
+		  XEXP (SET_SRC (single_set (cmp)), 1 - cmp_side)
+			= GEN_INT (count_fin_newval);
+		}
+	      else
+		{
+		  if (count_fin_isconst && !was_immediate)
+		    /* Value is in a register and we already changed
+		       initialization instruction in preheader.  */
+		    gcc_assert (new_comp_reg == NULL_RTX);
+		  else
+		    {
+		      /* Another case - use created by generate_prolog_epilog
+		         register, which value is initialized in prologue.  */
+		      gcc_assert (new_comp_reg != NULL_RTX);
+		      XEXP (SET_SRC (single_set (cmp)), 1 - cmp_side)
+			      = new_comp_reg;
+		    }
+		}
+	      df_insn_rescan (cmp);
+	    }
+	  else
+	    gcc_assert (new_comp_reg == NULL_RTX);
+
 	  break;
 	}
 
@@ -1752,7 +2186,9 @@  sms_schedule (void)
       free_ddg (g);
     }
 
+  df_set_flags (DF_LR_RUN_DCE);
   free (g_arr);
+  iv_analysis_done ();
 
   /* Release scheduler data, needed until now because of DFA.  */
   haifa_sched_finish ();