diff mbox series

[gimple-linterchange] Rewrite compute_access_stride

Message ID alpine.LSU.2.20.1712011325250.12252@zhemvz.fhfr.qr
State New
Headers show
Series [gimple-linterchange] Rewrite compute_access_stride | expand

Commit Message

Richard Biener Dec. 1, 2017, 12:31 p.m. UTC
This is the access stride computation change.  Apart from the
stride extraction I adjusted the cost model to handle non-constant
strides by checking if either is a multiple of the other and
simply fail interchanging if it's the wrong way around for one
ref or if the simple method using multiple_of_p fails to determine
either case.

This still handles the bwaves case.

I think we want additional testcases with variable strides for each
case we add - I believe this is the most conservative way to treat
variable strides.

It may be inconsistent with the constant stride handling where you
allow for many OK DRs to outweigh a few not OK DRs, but as it
worked for bwaves it must be good enough ;)

Tested on x86_64-unknown-linux-gnu (just the interchange testcases).

Currently running a bootstrap with -O3 -g -floop-interchange.

Ok for the branch?

Richard.

2017-12-01  Richard Biener  <rguenther@suse.de>

	* gimple-loop-interchange.cc (estimate_val_by_simplify_replace):
	Remove.
	(compute_access_stride): Rewrite using instantiate_scev,
	remove constant substitution.
	(should_interchange_loops): Adjust for non-constant strides.

Comments

Bin.Cheng Dec. 1, 2017, 12:54 p.m. UTC | #1
On Fri, Dec 1, 2017 at 12:31 PM, Richard Biener <rguenther@suse.de> wrote:
>
> This is the access stride computation change.  Apart from the
> stride extraction I adjusted the cost model to handle non-constant
> strides by checking if either is a multiple of the other and
> simply fail interchanging if it's the wrong way around for one
> ref or if the simple method using multiple_of_p fails to determine
> either case.
>
> This still handles the bwaves case.
>
> I think we want additional testcases with variable strides for each
> case we add - I believe this is the most conservative way to treat
> variable strides.
>
> It may be inconsistent with the constant stride handling where you
> allow for many OK DRs to outweigh a few not OK DRs, but as it
> worked for bwaves it must be good enough ;)
>
> Tested on x86_64-unknown-linux-gnu (just the interchange testcases).
>
> Currently running a bootstrap with -O3 -g -floop-interchange.
>
> Ok for the branch?
Ok.  This actually is closer to the motivation: simple/conservative cost
model that only transforms code when it's known to be good.
I will check the impact on the number of interchange in spec.

Thanks,
bin
>
> Richard.
>
> 2017-12-01  Richard Biener  <rguenther@suse.de>
>
>         * gimple-loop-interchange.cc (estimate_val_by_simplify_replace):
>         Remove.
>         (compute_access_stride): Rewrite using instantiate_scev,
>         remove constant substitution.
>         (should_interchange_loops): Adjust for non-constant strides.
>
> Index: gcc/gimple-loop-interchange.cc
> ===================================================================
> --- gcc/gimple-loop-interchange.cc      (revision 255303)
> +++ gcc/gimple-loop-interchange.cc      (working copy)
> @@ -1325,42 +1325,6 @@ tree_loop_interchange::move_code_to_inne
>      }
>  }
>
> -/* Estimate and return the value of EXPR by replacing variables in EXPR
> -   with CST_TREE and simplifying.  */
> -
> -static tree
> -estimate_val_by_simplify_replace (tree expr, tree cst_tree)
> -{
> -  unsigned i, n;
> -  tree ret = NULL_TREE, e, se;
> -
> -  if (!expr)
> -    return NULL_TREE;
> -
> -  /* Do not bother to replace constants.  */
> -  if (CONSTANT_CLASS_P (expr))
> -    return expr;
> -
> -  if (!EXPR_P (expr))
> -    return cst_tree;
> -
> -  n = TREE_OPERAND_LENGTH (expr);
> -  for (i = 0; i < n; i++)
> -    {
> -      e = TREE_OPERAND (expr, i);
> -      se = estimate_val_by_simplify_replace (e, cst_tree);
> -      if (e == se)
> -       continue;
> -
> -      if (!ret)
> -       ret = copy_node (expr);
> -
> -      TREE_OPERAND (ret, i) = se;
> -    }
> -
> -  return (ret ? fold (ret) : expr);
> -}
> -
>  /* Given data reference DR in LOOP_NEST, the function computes DR's access
>     stride at each level of loop from innermost LOOP to outer.  On success,
>     it saves access stride at each level loop in a vector which is pointed
> @@ -1388,44 +1352,31 @@ compute_access_stride (struct loop *loop
>
>    tree ref = DR_REF (dr);
>    tree scev_base = build_fold_addr_expr (ref);
> -  tree access_size = TYPE_SIZE_UNIT (TREE_TYPE (ref));
> -  tree niters = build_int_cst (sizetype, AVG_LOOP_NITER);
> -  access_size = fold_build2 (MULT_EXPR, sizetype, niters, access_size);
> -
> -  do {
> -    tree scev_fn = analyze_scalar_evolution (loop, scev_base);
> -    if (chrec_contains_undetermined (scev_fn)
> -       || chrec_contains_symbols_defined_in_loop (scev_fn, loop->num))
> -      break;
> -
> -    if (TREE_CODE (scev_fn) != POLYNOMIAL_CHREC)
> -      {
> -       scev_base = scev_fn;
> -       strides->safe_push (build_int_cst (sizetype, 0));
> -       continue;
> -      }
> -
> -    scev_base = CHREC_LEFT (scev_fn);
> -    if (tree_contains_chrecs (scev_base, NULL))
> -      break;
> -
> -    tree scev_step = fold_convert (sizetype, CHREC_RIGHT (scev_fn));
> -
> -    enum ev_direction scev_dir = scev_direction (scev_fn);
> -    /* Estimate if step isn't constant.  */
> -    if (scev_dir == EV_DIR_UNKNOWN)
> -      {
> -       scev_step = estimate_val_by_simplify_replace (scev_step, niters);
> -       if (TREE_CODE (scev_step) != INTEGER_CST
> -           || tree_int_cst_lt (scev_step, access_size))
> -         scev_step = access_size;
> -      }
> -    /* Compute absolute value of scev step.  */
> -    else if (scev_dir == EV_DIR_DECREASES)
> -      scev_step = fold_build1 (NEGATE_EXPR, sizetype, scev_step);
> -
> -    strides->safe_push (scev_step);
> -  } while (loop != loop_nest && (loop = loop_outer (loop)) != NULL);
> +  tree scev = analyze_scalar_evolution (loop, scev_base);
> +  scev = instantiate_scev (loop_preheader_edge (loop_nest), loop, scev);
> +  if (! chrec_contains_undetermined (scev))
> +    {
> +      tree sl = scev;
> +      struct loop *expected = loop;
> +      while (TREE_CODE (sl) == POLYNOMIAL_CHREC)
> +       {
> +         struct loop *sl_loop = get_chrec_loop (sl);
> +         while (sl_loop != expected)
> +           {
> +             strides->safe_push (size_int (0));
> +             expected = loop_outer (expected);
> +           }
> +         strides->safe_push (CHREC_RIGHT (sl));
> +         sl = CHREC_LEFT (sl);
> +         expected = loop_outer (expected);
> +       }
> +      if (! tree_contains_chrecs (sl, NULL))
> +       while (expected != loop_outer (loop_nest))
> +         {
> +           strides->safe_push (size_int (0));
> +           expected = loop_outer (expected);
> +         }
> +    }
>
>    dr->aux = strides;
>  }
> @@ -1538,6 +1489,9 @@ should_interchange_loops (unsigned i_idx
>    struct data_reference *dr;
>    bool all_seq_dr_before_p = true, all_seq_dr_after_p = true;
>    widest_int iloop_strides = 0, oloop_strides = 0;
> +  unsigned num_unresolved_drs = 0;
> +  unsigned num_resolved_ok_drs = 0;
> +  unsigned num_resolved_not_ok_drs = 0;
>
>    if (dump_info_p && dump_file && (dump_flags & TDF_DETAILS))
>      fprintf (dump_file, "\nData ref strides:\n\tmem_ref:\t\tiloop\toloop\n");
> @@ -1546,28 +1500,42 @@ should_interchange_loops (unsigned i_idx
>      {
>        vec<tree> *stride = DR_ACCESS_STRIDE (dr);
>        tree iloop_stride = (*stride)[i_idx], oloop_stride = (*stride)[o_idx];
> -      gcc_assert (TREE_CODE (iloop_stride) == INTEGER_CST);
> -      gcc_assert (TREE_CODE (oloop_stride) == INTEGER_CST);
>
>        bool subloop_stride_p = false;
>        /* Data ref can't be invariant or sequential access at current loop if
>          its address changes with respect to any subloops.  */
>        for (j = i_idx + 1; j < stride->length (); ++j)
> -       if (integer_nonzerop ((*stride)[j]))
> +       if (!integer_zerop ((*stride)[j]))
>           {
>             subloop_stride_p = true;
>             break;
>           }
>
> -      if (integer_nonzerop (iloop_stride))
> -       iloop_strides = wi::add (iloop_strides, wi::to_widest (iloop_stride));
> -      else if (!subloop_stride_p)
> -       num_old_inv_drs++;
> -
> -      if (integer_nonzerop (oloop_stride))
> -       oloop_strides = wi::add (oloop_strides, wi::to_widest (oloop_stride));
> -      else if (!subloop_stride_p)
> -       num_new_inv_drs++;
> +      if (integer_zerop (iloop_stride))
> +       {
> +         if (!subloop_stride_p)
> +           num_old_inv_drs++;
> +       }
> +      if (integer_zerop (oloop_stride))
> +       {
> +         if (!subloop_stride_p)
> +           num_new_inv_drs++;
> +       }
> +
> +      if (TREE_CODE (iloop_stride) == INTEGER_CST
> +         && TREE_CODE (oloop_stride) == INTEGER_CST)
> +       {
> +         iloop_strides = wi::add (iloop_strides, wi::to_widest (iloop_stride));
> +         oloop_strides = wi::add (oloop_strides, wi::to_widest (oloop_stride));
> +       }
> +      else if (multiple_of_p (TREE_TYPE (iloop_stride),
> +                             iloop_stride, oloop_stride))
> +       num_resolved_ok_drs++;
> +      else if (multiple_of_p (TREE_TYPE (iloop_stride),
> +                             oloop_stride, iloop_stride))
> +       num_resolved_not_ok_drs++;
> +      else
> +       num_unresolved_drs++;
>
>        /* Data ref can't be sequential access if its address changes in sub
>          loop.  */
> @@ -1581,10 +1549,12 @@ should_interchange_loops (unsigned i_idx
>          interchange.  Note invariant is considered sequential here.  */
>        tree access_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
>        if (all_seq_dr_before_p
> -         && wi::gtu_p (wi::to_wide (iloop_stride), wi::to_wide (access_size)))
> +         && ! (integer_zerop (iloop_stride)
> +               || operand_equal_p (access_size, iloop_stride, 0)))
>         all_seq_dr_before_p = false;
>        if (all_seq_dr_after_p
> -         && wi::gtu_p (wi::to_wide (oloop_stride), wi::to_wide (access_size)))
> +         && ! (integer_zerop (oloop_stride)
> +               || operand_equal_p (access_size, oloop_stride, 0)))
>         all_seq_dr_after_p = false;
>      }
>
> @@ -1601,8 +1571,17 @@ should_interchange_loops (unsigned i_idx
>        fprintf (dump_file, "All consecutive stride: before(%s), after(%s)\n",
>                all_seq_dr_before_p ? "true" : "false",
>                all_seq_dr_after_p ? "true" : "false");
> +      fprintf (dump_file, "OK to interchage with variable strides: %d\n",
> +              num_resolved_ok_drs);
> +      fprintf (dump_file, "Not OK to interchage with variable strides: %d\n",
> +              num_resolved_not_ok_drs);
> +      fprintf (dump_file, "Variable strides we cannot decide: %d\n",
> +              num_unresolved_drs);
>      }
>
> +  if (num_unresolved_drs != 0 || num_resolved_not_ok_drs != 0)
> +    return false;
> +
>    /* We use different stride comparison ratio for interchanging innermost
>       two loops or not.  The idea is to be conservative in interchange for
>       the innermost loops.  */
Richard Biener Dec. 1, 2017, 2:26 p.m. UTC | #2
On Fri, 1 Dec 2017, Bin.Cheng wrote:

> On Fri, Dec 1, 2017 at 12:31 PM, Richard Biener <rguenther@suse.de> wrote:
> >
> > This is the access stride computation change.  Apart from the
> > stride extraction I adjusted the cost model to handle non-constant
> > strides by checking if either is a multiple of the other and
> > simply fail interchanging if it's the wrong way around for one
> > ref or if the simple method using multiple_of_p fails to determine
> > either case.
> >
> > This still handles the bwaves case.
> >
> > I think we want additional testcases with variable strides for each
> > case we add - I believe this is the most conservative way to treat
> > variable strides.
> >
> > It may be inconsistent with the constant stride handling where you
> > allow for many OK DRs to outweigh a few not OK DRs, but as it
> > worked for bwaves it must be good enough ;)
> >
> > Tested on x86_64-unknown-linux-gnu (just the interchange testcases).
> >
> > Currently running a bootstrap with -O3 -g -floop-interchange.
> >
> > Ok for the branch?
> Ok.  This actually is closer to the motivation: simple/conservative cost
> model that only transforms code when it's known to be good.
> I will check the impact on the number of interchange in spec.

Few high-level observations.

In tree_loop_interchange::interchange we try interchanging adjacent
loops, starting from innermost with outer of innermost.  This way
the innermost loop will bubble up as much as possible.  But we
don't seem to handle bubbling multiple loops like for

 for (i=0; i<n; ++i)
  for (j=0; j<n; ++j)
    for (k=0; k<n; ++k)
      a[j][k][i] = 1.;

because the innermost two loops are correctly ordered so we then
try interchanging the k and the i loop which succeeds but then
we stop.  So there's something wrong in the iteration scheme.
I would have expected it to be quadratic, basically iterating
the ::interchange loop until we didn't perform any interchange
(or doing sth more clever).

loop_cand::can_interchange_p seems to perform per BB checks
(supported_operations, num_stmts) that with the way we interchange
should disallow any such BB in a loop that we interchange or
interchange across.  That means it looks like sth we should
perform early, like during data dependence gathering by for
example inlining find_data_references_in_bb and doing those
per-stmt checks there?

In prepare_perfect_loop_nest we seem to be somewhat quadratic
in the way we re-compute dependences if doing so failed
(we also always just strip the outermost loop while the failing
DDR could involve a DR that is in an inner loop).  I think it
should be possible to re-structure this computing dependences
from inner loop body to outer loop bodies (the ddrs vector
is, as opposed to the dr vector, unsorted I think).
I haven't fully thought this out yet though - a similar
iteration scheme could improve DR gathering though that's not
so costly.

Overall we should try improving on function names, we have
valid_data_dependences, can_interchange_loops, should_interchange_loops,
can_interchange_p which all are related but do slightly different
things.  My usual approach is to inline all single-use functions
to improve things (and make program flow more visible).  But
I guess that's too much implementation detail.

Didn't get to the IV re-mapping stuff yet but you (of course)
end up with quite some dead IVs when iterating the interchange.
You seem to add a new canonical IV just to avoid rewriting the
existing exit test, right?  Deferring that to the "final"
interchange on a nest should avoid those dead IVs.

Will now leave for the weekend.

Thanks,
Richard.
Bin.Cheng Dec. 1, 2017, 3 p.m. UTC | #3
On Fri, Dec 1, 2017 at 2:26 PM, Richard Biener <rguenther@suse.de> wrote:
> On Fri, 1 Dec 2017, Bin.Cheng wrote:
>
>> On Fri, Dec 1, 2017 at 12:31 PM, Richard Biener <rguenther@suse.de> wrote:
>> >
>> > This is the access stride computation change.  Apart from the
>> > stride extraction I adjusted the cost model to handle non-constant
>> > strides by checking if either is a multiple of the other and
>> > simply fail interchanging if it's the wrong way around for one
>> > ref or if the simple method using multiple_of_p fails to determine
>> > either case.
>> >
>> > This still handles the bwaves case.
>> >
>> > I think we want additional testcases with variable strides for each
>> > case we add - I believe this is the most conservative way to treat
>> > variable strides.
>> >
>> > It may be inconsistent with the constant stride handling where you
>> > allow for many OK DRs to outweigh a few not OK DRs, but as it
>> > worked for bwaves it must be good enough ;)
>> >
>> > Tested on x86_64-unknown-linux-gnu (just the interchange testcases).
>> >
>> > Currently running a bootstrap with -O3 -g -floop-interchange.
>> >
>> > Ok for the branch?
>> Ok.  This actually is closer to the motivation: simple/conservative cost
>> model that only transforms code when it's known to be good.
>> I will check the impact on the number of interchange in spec.
>
> Few high-level observations.
>
> In tree_loop_interchange::interchange we try interchanging adjacent
> loops, starting from innermost with outer of innermost.  This way
> the innermost loop will bubble up as much as possible.  But we
> don't seem to handle bubbling multiple loops like for
>
>  for (i=0; i<n; ++i)
>   for (j=0; j<n; ++j)
>     for (k=0; k<n; ++k)
>       a[j][k][i] = 1.;
>
> because the innermost two loops are correctly ordered so we then
> try interchanging the k and the i loop which succeeds but then
> we stop.  So there's something wrong in the iteration scheme.
> I would have expected it to be quadratic, basically iterating
> the ::interchange loop until we didn't perform any interchange
> (or doing sth more clever).
Yes, I restricted it to be a single pass process in loop nest.
Ideally we could create a vector of loop_cand for the whole loop nest,
then sort/permute loops wrt computed strides.

>
> loop_cand::can_interchange_p seems to perform per BB checks
> (supported_operations, num_stmts) that with the way we interchange
> should disallow any such BB in a loop that we interchange or
> interchange across.  That means it looks like sth we should
> perform early, like during data dependence gathering by for
> example inlining find_data_references_in_bb and doing those
> per-stmt checks there?
Yes.  The only problem is the check on the reduction.  We can build up
all loop_cand earlier, or simply move non-reduction checks earlier.

>
> In prepare_perfect_loop_nest we seem to be somewhat quadratic
> in the way we re-compute dependences if doing so failed
> (we also always just strip the outermost loop while the failing
> DDR could involve a DR that is in an inner loop).  I think it
> should be possible to re-structure this computing dependences
> from inner loop body to outer loop bodies (the ddrs vector
> is, as opposed to the dr vector, unsorted I think).
Even more, we would want interface in tree-data-ref.c so that data
dependence can be computed/assembled level by level in loop nest.
loop distribution can be benefited as well.

> I haven't fully thought this out yet though - a similar
> iteration scheme could improve DR gathering though that's not
> so costly.
I can change this one now.

>
> Overall we should try improving on function names, we have
> valid_data_dependences, can_interchange_loops, should_interchange_loops,
> can_interchange_p which all are related but do slightly different
> things.  My usual approach is to inline all single-use functions
> to improve things (and make program flow more visible).  But
> I guess that's too much implementation detail.
>
> Didn't get to the IV re-mapping stuff yet but you (of course)
> end up with quite some dead IVs when iterating the interchange.
> You seem to add a new canonical IV just to avoid rewriting the
> existing exit test, right?  Deferring that to the "final"
> interchange on a nest should avoid those dead IVs.
Hmm, with the help of the newly created DCE interface, all dead IV/RE will
be deleted after this pass.  Note for current implementation, dead
code can be generated from mapped IV, canonical IV and the reduction.
Deferring adding canonical IV does not look practical wrt current level
by level interchange, because inner loop's IV is needed for
interchange?

>
> Will now leave for the weekend.
Have a nice WE!

Thanks,
bin
>
> Thanks,
> Richard.
diff mbox series

Patch

Index: gcc/gimple-loop-interchange.cc
===================================================================
--- gcc/gimple-loop-interchange.cc	(revision 255303)
+++ gcc/gimple-loop-interchange.cc	(working copy)
@@ -1325,42 +1325,6 @@  tree_loop_interchange::move_code_to_inne
     }
 }
 
-/* Estimate and return the value of EXPR by replacing variables in EXPR
-   with CST_TREE and simplifying.  */
-
-static tree
-estimate_val_by_simplify_replace (tree expr, tree cst_tree)
-{
-  unsigned i, n;
-  tree ret = NULL_TREE, e, se;
-
-  if (!expr)
-    return NULL_TREE;
-
-  /* Do not bother to replace constants.  */
-  if (CONSTANT_CLASS_P (expr))
-    return expr;
-
-  if (!EXPR_P (expr))
-    return cst_tree;
-
-  n = TREE_OPERAND_LENGTH (expr);
-  for (i = 0; i < n; i++)
-    {
-      e = TREE_OPERAND (expr, i);
-      se = estimate_val_by_simplify_replace (e, cst_tree);
-      if (e == se)
-	continue;
-
-      if (!ret)
-	ret = copy_node (expr);
-
-      TREE_OPERAND (ret, i) = se;
-    }
-
-  return (ret ? fold (ret) : expr);
-}
-
 /* Given data reference DR in LOOP_NEST, the function computes DR's access
    stride at each level of loop from innermost LOOP to outer.  On success,
    it saves access stride at each level loop in a vector which is pointed
@@ -1388,44 +1352,31 @@  compute_access_stride (struct loop *loop
 
   tree ref = DR_REF (dr);
   tree scev_base = build_fold_addr_expr (ref);
-  tree access_size = TYPE_SIZE_UNIT (TREE_TYPE (ref));
-  tree niters = build_int_cst (sizetype, AVG_LOOP_NITER);
-  access_size = fold_build2 (MULT_EXPR, sizetype, niters, access_size);
-
-  do {
-    tree scev_fn = analyze_scalar_evolution (loop, scev_base);
-    if (chrec_contains_undetermined (scev_fn)
-	|| chrec_contains_symbols_defined_in_loop (scev_fn, loop->num))
-      break;
-
-    if (TREE_CODE (scev_fn) != POLYNOMIAL_CHREC)
-      {
-	scev_base = scev_fn;
-	strides->safe_push (build_int_cst (sizetype, 0));
-	continue;
-      }
-
-    scev_base = CHREC_LEFT (scev_fn);
-    if (tree_contains_chrecs (scev_base, NULL))
-      break;
-
-    tree scev_step = fold_convert (sizetype, CHREC_RIGHT (scev_fn));
-
-    enum ev_direction scev_dir = scev_direction (scev_fn);
-    /* Estimate if step isn't constant.  */
-    if (scev_dir == EV_DIR_UNKNOWN)
-      {
-	scev_step = estimate_val_by_simplify_replace (scev_step, niters);
-	if (TREE_CODE (scev_step) != INTEGER_CST
-	    || tree_int_cst_lt (scev_step, access_size))
-	  scev_step = access_size;
-      }
-    /* Compute absolute value of scev step.  */
-    else if (scev_dir == EV_DIR_DECREASES)
-      scev_step = fold_build1 (NEGATE_EXPR, sizetype, scev_step);
-
-    strides->safe_push (scev_step);
-  } while (loop != loop_nest && (loop = loop_outer (loop)) != NULL);
+  tree scev = analyze_scalar_evolution (loop, scev_base);
+  scev = instantiate_scev (loop_preheader_edge (loop_nest), loop, scev);
+  if (! chrec_contains_undetermined (scev))
+    {
+      tree sl = scev;
+      struct loop *expected = loop;
+      while (TREE_CODE (sl) == POLYNOMIAL_CHREC)
+	{
+	  struct loop *sl_loop = get_chrec_loop (sl);
+	  while (sl_loop != expected)
+	    {
+	      strides->safe_push (size_int (0));
+	      expected = loop_outer (expected);
+	    }
+	  strides->safe_push (CHREC_RIGHT (sl));
+	  sl = CHREC_LEFT (sl);
+	  expected = loop_outer (expected);
+	}
+      if (! tree_contains_chrecs (sl, NULL))
+	while (expected != loop_outer (loop_nest))
+	  {
+	    strides->safe_push (size_int (0));
+	    expected = loop_outer (expected);
+	  }
+    }
 
   dr->aux = strides;
 }
@@ -1538,6 +1489,9 @@  should_interchange_loops (unsigned i_idx
   struct data_reference *dr;
   bool all_seq_dr_before_p = true, all_seq_dr_after_p = true;
   widest_int iloop_strides = 0, oloop_strides = 0;
+  unsigned num_unresolved_drs = 0;
+  unsigned num_resolved_ok_drs = 0;
+  unsigned num_resolved_not_ok_drs = 0;
 
   if (dump_info_p && dump_file && (dump_flags & TDF_DETAILS))
     fprintf (dump_file, "\nData ref strides:\n\tmem_ref:\t\tiloop\toloop\n");
@@ -1546,28 +1500,42 @@  should_interchange_loops (unsigned i_idx
     {
       vec<tree> *stride = DR_ACCESS_STRIDE (dr);
       tree iloop_stride = (*stride)[i_idx], oloop_stride = (*stride)[o_idx];
-      gcc_assert (TREE_CODE (iloop_stride) == INTEGER_CST);
-      gcc_assert (TREE_CODE (oloop_stride) == INTEGER_CST);
 
       bool subloop_stride_p = false;
       /* Data ref can't be invariant or sequential access at current loop if
 	 its address changes with respect to any subloops.  */
       for (j = i_idx + 1; j < stride->length (); ++j)
-	if (integer_nonzerop ((*stride)[j]))
+	if (!integer_zerop ((*stride)[j]))
 	  {
 	    subloop_stride_p = true;
 	    break;
 	  }
 
-      if (integer_nonzerop (iloop_stride))
-	iloop_strides = wi::add (iloop_strides, wi::to_widest (iloop_stride));
-      else if (!subloop_stride_p)
-	num_old_inv_drs++;
-
-      if (integer_nonzerop (oloop_stride))
-	oloop_strides = wi::add (oloop_strides, wi::to_widest (oloop_stride));
-      else if (!subloop_stride_p)
-	num_new_inv_drs++;
+      if (integer_zerop (iloop_stride))
+	{
+	  if (!subloop_stride_p)
+	    num_old_inv_drs++;
+	}
+      if (integer_zerop (oloop_stride))
+	{
+	  if (!subloop_stride_p)
+	    num_new_inv_drs++;
+	}
+
+      if (TREE_CODE (iloop_stride) == INTEGER_CST
+	  && TREE_CODE (oloop_stride) == INTEGER_CST)
+	{
+	  iloop_strides = wi::add (iloop_strides, wi::to_widest (iloop_stride));
+	  oloop_strides = wi::add (oloop_strides, wi::to_widest (oloop_stride));
+	}
+      else if (multiple_of_p (TREE_TYPE (iloop_stride),
+			      iloop_stride, oloop_stride))
+	num_resolved_ok_drs++;
+      else if (multiple_of_p (TREE_TYPE (iloop_stride),
+			      oloop_stride, iloop_stride))
+	num_resolved_not_ok_drs++;
+      else
+	num_unresolved_drs++;
 
       /* Data ref can't be sequential access if its address changes in sub
 	 loop.  */
@@ -1581,10 +1549,12 @@  should_interchange_loops (unsigned i_idx
 	 interchange.  Note invariant is considered sequential here.  */
       tree access_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
       if (all_seq_dr_before_p
-	  && wi::gtu_p (wi::to_wide (iloop_stride), wi::to_wide (access_size)))
+	  && ! (integer_zerop (iloop_stride)
+		|| operand_equal_p (access_size, iloop_stride, 0)))
 	all_seq_dr_before_p = false;
       if (all_seq_dr_after_p
-	  && wi::gtu_p (wi::to_wide (oloop_stride), wi::to_wide (access_size)))
+	  && ! (integer_zerop (oloop_stride)
+		|| operand_equal_p (access_size, oloop_stride, 0)))
 	all_seq_dr_after_p = false;
     }
 
@@ -1601,8 +1571,17 @@  should_interchange_loops (unsigned i_idx
       fprintf (dump_file, "All consecutive stride: before(%s), after(%s)\n",
 	       all_seq_dr_before_p ? "true" : "false",
 	       all_seq_dr_after_p ? "true" : "false");
+      fprintf (dump_file, "OK to interchage with variable strides: %d\n",
+	       num_resolved_ok_drs);
+      fprintf (dump_file, "Not OK to interchage with variable strides: %d\n",
+	       num_resolved_not_ok_drs);
+      fprintf (dump_file, "Variable strides we cannot decide: %d\n",
+	       num_unresolved_drs);
     }
 
+  if (num_unresolved_drs != 0 || num_resolved_not_ok_drs != 0)
+    return false;
+
   /* We use different stride comparison ratio for interchanging innermost
      two loops or not.  The idea is to be conservative in interchange for
      the innermost loops.  */