diff mbox series

Fix PR90796

Message ID alpine.LSU.2.21.1910220930570.30048@wotan.suse.de
State New
Headers show
Series Fix PR90796 | expand

Commit Message

Michael Matz Oct. 22, 2019, 9:36 a.m. UTC
Hi,

this was still collecting dust on my disk, so here it is.  See the 
extensive comment in the patch for what happens, in short invariant IVs 
are affine but still have to be handled more conservative than other 
affine IVs in transformations that reorder instructions.  Making our 
dependence analysis more conservative instead would be too much, we 
wouldn't be able to handle cases that we should handle anymore.

Regstrapped on x86-64-linux without regressions (all languages+Ada).


Ciao,
Michael.

	PR middle-end/90796
	* gimple-loop-jam.c (any_access_function_variant_p): New function.
	(adjust_unroll_factor): Use it to constrain safety, new parameter.
	(tree_loop_unroll_and_jam): Adjust call and profitable unroll factor.

testsuite/
	* gcc.dg/unroll-and-jam.c: Add three invalid and one valid case.

Comments

Richard Biener Oct. 22, 2019, 12:08 p.m. UTC | #1
On Tue, Oct 22, 2019 at 11:36 AM Michael Matz <matz@suse.de> wrote:
>
> Hi,
>
> this was still collecting dust on my disk, so here it is.  See the
> extensive comment in the patch for what happens, in short invariant IVs
> are affine but still have to be handled more conservative than other
> affine IVs in transformations that reorder instructions.  Making our
> dependence analysis more conservative instead would be too much, we
> wouldn't be able to handle cases that we should handle anymore.
>
> Regstrapped on x86-64-linux without regressions (all languages+Ada).

OK for trunk and affected branches.

Thanks,
Richard.

>
> Ciao,
> Michael.
>
>         PR middle-end/90796
>         * gimple-loop-jam.c (any_access_function_variant_p): New function.
>         (adjust_unroll_factor): Use it to constrain safety, new parameter.
>         (tree_loop_unroll_and_jam): Adjust call and profitable unroll factor.
>
> testsuite/
>         * gcc.dg/unroll-and-jam.c: Add three invalid and one valid case.
>
> diff --git a/gcc/gimple-loop-jam.c b/gcc/gimple-loop-jam.c
> index 11153f5..899653b 100644
> --- a/gcc/gimple-loop-jam.c
> +++ b/gcc/gimple-loop-jam.c
> @@ -360,16 +360,33 @@ fuse_loops (class loop *loop)
>    rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop);
>  }
>
> +/* Return true if any of the access functions for dataref A
> +   isn't invariant with respect to loop LOOP_NEST.  */
> +static bool
> +any_access_function_variant_p (const struct data_reference *a,
> +                              const class loop *loop_nest)
> +{
> +  unsigned int i;
> +  vec<tree> fns = DR_ACCESS_FNS (a);
> +  tree t;
> +
> +  FOR_EACH_VEC_ELT (fns, i, t)
> +    if (!evolution_function_is_invariant_p (t, loop_nest->num))
> +      return true;
> +
> +  return false;
> +}
> +
>  /* Returns true if the distance in DDR can be determined and adjusts
>     the unroll factor in *UNROLL to make unrolling valid for that distance.
> -   Otherwise return false.
> +   Otherwise return false.  DDR is with respect to the outer loop of INNER.
>
>     If this data dep can lead to a removed memory reference, increment
>     *REMOVED and adjust *PROFIT_UNROLL to be the necessary unroll factor
>     for this to happen.  */
>
>  static bool
> -adjust_unroll_factor (struct data_dependence_relation *ddr,
> +adjust_unroll_factor (class loop *inner, struct data_dependence_relation *ddr,
>                       unsigned *unroll, unsigned *profit_unroll,
>                       unsigned *removed)
>  {
> @@ -392,9 +409,59 @@ adjust_unroll_factor (struct data_dependence_relation *ddr,
>             gcc_unreachable ();
>           else if ((unsigned)dist >= *unroll)
>             ;
> -         else if (lambda_vector_lexico_pos (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)
> -                  || (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)
> -                      && dist > 0))
> +         else if (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
> +           {
> +             /* We have (a,0) with a < N, so this will be transformed into
> +                (0,0) after unrolling by N.  This might potentially be a
> +                problem, if it's not a read-read dependency.  */
> +             if (DR_IS_READ (DDR_A (ddr)) && DR_IS_READ (DDR_B (ddr)))
> +               ;
> +             else
> +               {
> +                 /* So, at least one is a write, and we might reduce the
> +                    distance vector to (0,0).  This is still no problem
> +                    if both data-refs are affine with respect to the inner
> +                    loops.  But if one of them is invariant with respect
> +                    to an inner loop our reordering implicit in loop fusion
> +                    corrupts the program, as our data dependences don't
> +                    capture this.  E.g. for:
> +                      for (0 <= i < n)
> +                        for (0 <= j < m)
> +                          a[i][0] = a[i+1][0] + 2;    // (1)
> +                          b[i][j] = b[i+1][j] + 2;    // (2)
> +                    the distance vector for both statements is (-1,0),
> +                    but exchanging the order for (2) is okay, while
> +                    for (1) it is not.  To see this, write out the original
> +                    accesses (assume m is 2):
> +                      a i j original
> +                      0 0 0 r a[1][0] b[1][0]
> +                      1 0 0 w a[0][0] b[0][0]
> +                      2 0 1 r a[1][0] b[1][1]
> +                      3 0 1 w a[0][0] b[0][1]
> +                      4 1 0 r a[2][0] b[2][0]
> +                      5 1 0 w a[1][0] b[1][0]
> +                    after unroll-by-2 and fusion the accesses are done in
> +                    this order (from column a): 0,1, 4,5, 2,3, i.e. this:
> +                      a i j transformed
> +                      0 0 0 r a[1][0] b[1][0]
> +                      1 0 0 w a[0][0] b[0][0]
> +                      4 1 0 r a[2][0] b[2][0]
> +                      5 1 0 w a[1][0] b[1][0]
> +                      2 0 1 r a[1][0] b[1][1]
> +                      3 0 1 w a[0][0] b[0][1]
> +                    Note how access 2 accesses the same element as access 5
> +                    for array 'a' but not for array 'b'.  */
> +                 if (any_access_function_variant_p (DDR_A (ddr), inner)
> +                     && any_access_function_variant_p (DDR_B (ddr), inner))
> +                   ;
> +                 else
> +                   /* And if any dataref of this pair is invariant with
> +                      respect to the inner loop, we have no chance than
> +                      to reduce the unroll factor.  */
> +                   *unroll = dist;
> +               }
> +           }
> +         else if (lambda_vector_lexico_pos (dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
>             ;
>           else
>             *unroll = dist;
> @@ -486,7 +553,7 @@ tree_loop_unroll_and_jam (void)
>           /* Now check the distance vector, for determining a sensible
>              outer unroll factor, and for validity of merging the inner
>              loop copies.  */
> -         if (!adjust_unroll_factor (ddr, &unroll_factor, &profit_unroll,
> +         if (!adjust_unroll_factor (loop, ddr, &unroll_factor, &profit_unroll,
>                                      &removed))
>             {
>               /* Couldn't get the distance vector.  For two reads that's
> @@ -506,7 +573,7 @@ tree_loop_unroll_and_jam (void)
>          to ignore all profitability concerns and apply the transformation
>          always.  */
>        if (!PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT))
> -       profit_unroll = 2;
> +       profit_unroll = MAX(2, profit_unroll);
>        else if (removed * 100 / datarefs.length ()
>           < (unsigned)PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT))
>         profit_unroll = 1;
> diff --git a/gcc/testsuite/gcc.dg/unroll-and-jam.c b/gcc/testsuite/gcc.dg/unroll-and-jam.c
> index 70910d3..bcfe1bd 100644
> --- a/gcc/testsuite/gcc.dg/unroll-and-jam.c
> +++ b/gcc/testsuite/gcc.dg/unroll-and-jam.c
> @@ -31,10 +31,10 @@ void checkb(void)
>    //printf("  %d\n", sum);
>  }
>
> +unsigned i, j;
>  #define TEST(name, body, test) \
>  static void __attribute__((noinline,noclone)) name (unsigned long n, unsigned long m) \
>  { \
> -  unsigned long i, j; \
>    for (i = 1; i < m; i++) { \
>        for (j = 1; j < n; j++) { \
>           body; \
> @@ -58,9 +58,14 @@ TEST(foo3, aa[i+1][j-1]=aa[i][j] * aa[i][j] / 2, checkaa()) //notok, -1,1
>  TEST(foo4, aa[i][j] = aa[i-1][j+1] * aa[i-1][j+1] / 2, checkaa()) //notok, -1,1
>  TEST(foo5, aa[i][j] = aa[i+1][j+1] * aa[i+1][j+1] / 2, checkaa()) //ok, 1,1
>  TEST(foo6, aa[i][j] = aa[i+1][j] * aa[i+1][j] / 2, checkaa()) //ok, -1,0
> +TEST(foo61, aa[i][0] = aa[i+1][0] * aa[i+1][0] / 2, checkaa()) //notok, -1,0
> +TEST(foo62, aa[i][j/2] = aa[i+1][j/2] * aa[i+1][j/2] / 2, checkaa()) //notok, not affine
> +TEST(foo63, aa[i][j%2] = aa[i+1][j%2] * aa[i+1][j%2] / 2, checkaa()) //notok, not affine
>  TEST(foo7, aa[i+1][j] = aa[i][j] * aa[i][j] / 2, checkaa()) //ok, 1,0
>  TEST(foo9, b[j] = 3*b[j+1] + 1, checkb()) //notok, 0,-1
>  TEST(foo10, b[j] = 3*b[j] + 1, checkb()) //ok, 0,0
> +extern int f;
> +TEST(foo11, f = b[i-1] = 1 + 3* b[i+1], checkb()) //ok, 2,0 but must reduce unroll factor to 2, (it would be incorrect with unroll-by-3, which the profitability would suggest)
>
>  /* foo8 should work as well, but currently doesn't because the distance
>     vectors we compute are too pessimistic.  We compute
> @@ -68,6 +73,7 @@ TEST(foo10, b[j] = 3*b[j] + 1, checkb()) //ok, 0,0
>     and the last one causes us to lose.  */
>  TEST(foo8, b[j+1] = 3*b[j] + 1, checkb()) //ok, 0,1
>
> +int f;
>  unsigned int a[1024];
>  unsigned int b[1024];
>  unsigned int aa[16][1024];
> @@ -88,10 +94,12 @@ void init(void)
>      printf(" %s\n", #name); \
>      init();for(i=0;i<4;i++)name##noopt(32,8); checka = checksum; \
>      init();for(i=0;i<4;i++)name(32,8); \
> +    if (checka != checksum) fail = 1; \
>      printf("%sok %s\n", checka != checksum ? "NOT " : "", #name);
>
>  int main()
>  {
> +  int fail = 0;
>    int i;
>    unsigned checka;
>    RUN(foo1);
> @@ -100,12 +108,18 @@ int main()
>    RUN(foo4);
>    RUN(foo5);
>    RUN(foo6);
> +  RUN(foo61);
> +  RUN(foo62);
> +  RUN(foo63);
>    RUN(foo7);
>    RUN(foo8);
>    RUN(foo9);
>    RUN(foo10);
> -  return 0;
> +  RUN(foo11);
> +  if (fail)
> +    __builtin_abort();
> +  return fail;
>  }
>
> -/* Five loops should be unroll-jammed (actually six, but see above).  */
> -/* { dg-final { scan-tree-dump-times "applying unroll and jam" 5 "unrolljam" } } */
> +/* Six loops should be unroll-jammed (actually seven, but see above).  */
> +/* { dg-final { scan-tree-dump-times "applying unroll and jam" 6 "unrolljam" } } */
Rainer Orth Oct. 22, 2019, 8:55 p.m. UTC | #2
Hi Michael,

> this was still collecting dust on my disk, so here it is.  See the 
> extensive comment in the patch for what happens, in short invariant IVs 
> are affine but still have to be handled more conservative than other 
> affine IVs in transformations that reorder instructions.  Making our 
> dependence analysis more conservative instead would be too much, we 
> wouldn't be able to handle cases that we should handle anymore.
>
> Regstrapped on x86-64-linux without regressions (all languages+Ada).
>
>
> Ciao,
> Michael.
>
> 	PR middle-end/90796
> 	* gimple-loop-jam.c (any_access_function_variant_p): New function.
> 	(adjust_unroll_factor): Use it to constrain safety, new parameter.
> 	(tree_loop_unroll_and_jam): Adjust call and profitable unroll factor.
>
> testsuite/
> 	* gcc.dg/unroll-and-jam.c: Add three invalid and one valid case.

this testcase now FAILs on 32-bit targets (seen on i386-pc-solaris2.11
and sparc-sun-solaris2.11, also reports for i686-pc-linux-gnu and
i586-unknown-freebsd11.2):

+FAIL: gcc.dg/unroll-and-jam.c scan-tree-dump-times unrolljam "applying unroll and jam" 6

	Rainer
Michael Matz Oct. 23, 2019, 4:02 p.m. UTC | #3
Hello,

On Tue, 22 Oct 2019, Rainer Orth wrote:

> > testsuite/
> > 	* gcc.dg/unroll-and-jam.c: Add three invalid and one valid case.
> 
> this testcase now FAILs on 32-bit targets (seen on i386-pc-solaris2.11
> and sparc-sun-solaris2.11, also reports for i686-pc-linux-gnu and
> i586-unknown-freebsd11.2):
> 
> +FAIL: gcc.dg/unroll-and-jam.c scan-tree-dump-times unrolljam "applying unroll and jam" 6

Hrmpf, I'll have a look :-/  Thanks for noticing.


Ciao,
Michael.
Michael Matz Oct. 28, 2019, 10:31 a.m. UTC | #4
Hi,

On Wed, 23 Oct 2019, Michael Matz wrote:

> On Tue, 22 Oct 2019, Rainer Orth wrote:
> 
> > > testsuite/
> > > 	* gcc.dg/unroll-and-jam.c: Add three invalid and one valid case.
> > 
> > this testcase now FAILs on 32-bit targets (seen on i386-pc-solaris2.11
> > and sparc-sun-solaris2.11, also reports for i686-pc-linux-gnu and
> > i586-unknown-freebsd11.2):
> > 
> > +FAIL: gcc.dg/unroll-and-jam.c scan-tree-dump-times unrolljam "applying unroll and jam" 6
> 
> Hrmpf, I'll have a look :-/  Thanks for noticing.

A strange interaction with LIM, which only does something on 32bit, but 
not on 64bit (which is a problem to investigate on its own, but outside of 
scope).  Disabling it requires changing the testcase so that LIM isn't 
necessary for other reasons, as below.  Can you check if that fixes the 
problem on Solaris as well?  (It does for linux 32bit).


Ciao,
Michael.

testsuite/
	PR middle-end/90796
	* gcc.dg/unroll-and-jam.c: Disable loop-invariant motion.

--- a/gcc/testsuite/gcc.dg/unroll-and-jam.c
+++ b/gcc/testsuite/gcc.dg/unroll-and-jam.c
@@ -1,5 +1,5 @@
 /* { dg-do run } */
-/* { dg-options "-O3 -floop-unroll-and-jam --param unroll-jam-min-percent=0 -fdump-tree-unrolljam-details" } */
+/* { dg-options "-O3 -floop-unroll-and-jam -fno-tree-loop-im --param unroll-jam-min-percent=0 -fdump-tree-unrolljam-details" } */
 /* { dg-require-effective-target int32plus } */
 
 #include <stdio.h>
@@ -31,10 +31,10 @@ void checkb(void)
   //printf("  %d\n", sum);
 }
 
-unsigned i, j;
 #define TEST(name, body, test) \
 static void __attribute__((noinline,noclone)) name (unsigned long n, unsigned long m) \
 { \
+  unsigned i, j; \
   for (i = 1; i < m; i++) { \
       for (j = 1; j < n; j++) { \
 	  body; \
Rainer Orth Oct. 28, 2019, 10:45 a.m. UTC | #5
Hi Michael,

>> On Tue, 22 Oct 2019, Rainer Orth wrote:
>> 
>> > > testsuite/
>> > > 	* gcc.dg/unroll-and-jam.c: Add three invalid and one valid case.
>> > 
>> > this testcase now FAILs on 32-bit targets (seen on i386-pc-solaris2.11
>> > and sparc-sun-solaris2.11, also reports for i686-pc-linux-gnu and
>> > i586-unknown-freebsd11.2):
>> > 
>> > +FAIL: gcc.dg/unroll-and-jam.c scan-tree-dump-times unrolljam "applying
>> > unroll and jam" 6
>> 
>> Hrmpf, I'll have a look :-/  Thanks for noticing.
>
> A strange interaction with LIM, which only does something on 32bit, but 
> not on 64bit (which is a problem to investigate on its own, but outside of 
> scope).  Disabling it requires changing the testcase so that LIM isn't 
> necessary for other reasons, as below.  Can you check if that fixes the 
> problem on Solaris as well?  (It does for linux 32bit).

I just checked: the test now PASSes on both sparc-sun-solaris2.11 and
i386-pc-solaris2.11 (32 and 64-bit each).

Thanks.
        Rainer
Michael Matz Oct. 28, 2019, 10:59 a.m. UTC | #6
Hello,

On Mon, 28 Oct 2019, Rainer Orth wrote:

> >> > +FAIL: gcc.dg/unroll-and-jam.c scan-tree-dump-times unrolljam "applying
> >> > unroll and jam" 6
> >> 
> >> Hrmpf, I'll have a look :-/  Thanks for noticing.
> >
> > A strange interaction with LIM, which only does something on 32bit, but 
> > not on 64bit (which is a problem to investigate on its own, but outside of 
> > scope).  Disabling it requires changing the testcase so that LIM isn't 
> > necessary for other reasons, as below.  Can you check if that fixes the 
> > problem on Solaris as well?  (It does for linux 32bit).
> 
> I just checked: the test now PASSes on both sparc-sun-solaris2.11 and
> i386-pc-solaris2.11 (32 and 64-bit each).

Thank you.  Committed as r277508 now.


Ciao,
Michael.
diff mbox series

Patch

diff --git a/gcc/gimple-loop-jam.c b/gcc/gimple-loop-jam.c
index 11153f5..899653b 100644
--- a/gcc/gimple-loop-jam.c
+++ b/gcc/gimple-loop-jam.c
@@ -360,16 +360,33 @@  fuse_loops (class loop *loop)
   rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop);
 }
 
+/* Return true if any of the access functions for dataref A
+   isn't invariant with respect to loop LOOP_NEST.  */
+static bool
+any_access_function_variant_p (const struct data_reference *a,
+			       const class loop *loop_nest)
+{
+  unsigned int i;
+  vec<tree> fns = DR_ACCESS_FNS (a);
+  tree t;
+
+  FOR_EACH_VEC_ELT (fns, i, t)
+    if (!evolution_function_is_invariant_p (t, loop_nest->num))
+      return true;
+
+  return false;
+}
+
 /* Returns true if the distance in DDR can be determined and adjusts
    the unroll factor in *UNROLL to make unrolling valid for that distance.
-   Otherwise return false.
+   Otherwise return false.  DDR is with respect to the outer loop of INNER.
 
    If this data dep can lead to a removed memory reference, increment
    *REMOVED and adjust *PROFIT_UNROLL to be the necessary unroll factor
    for this to happen.  */
 
 static bool
-adjust_unroll_factor (struct data_dependence_relation *ddr,
+adjust_unroll_factor (class loop *inner, struct data_dependence_relation *ddr,
 		      unsigned *unroll, unsigned *profit_unroll,
 		      unsigned *removed)
 {
@@ -392,9 +409,59 @@  adjust_unroll_factor (struct data_dependence_relation *ddr,
 	    gcc_unreachable ();
 	  else if ((unsigned)dist >= *unroll)
 	    ;
-	  else if (lambda_vector_lexico_pos (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)
-		   || (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)
-		       && dist > 0))
+	  else if (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
+	    {
+	      /* We have (a,0) with a < N, so this will be transformed into
+	         (0,0) after unrolling by N.  This might potentially be a
+		 problem, if it's not a read-read dependency.  */
+	      if (DR_IS_READ (DDR_A (ddr)) && DR_IS_READ (DDR_B (ddr)))
+		;
+	      else
+		{
+		  /* So, at least one is a write, and we might reduce the
+		     distance vector to (0,0).  This is still no problem
+		     if both data-refs are affine with respect to the inner
+		     loops.  But if one of them is invariant with respect
+		     to an inner loop our reordering implicit in loop fusion
+		     corrupts the program, as our data dependences don't
+		     capture this.  E.g. for:
+		       for (0 <= i < n)
+		         for (0 <= j < m)
+		           a[i][0] = a[i+1][0] + 2;    // (1)
+		           b[i][j] = b[i+1][j] + 2;    // (2)
+		     the distance vector for both statements is (-1,0),
+		     but exchanging the order for (2) is okay, while
+		     for (1) it is not.  To see this, write out the original
+		     accesses (assume m is 2):
+		       a i j original
+		       0 0 0 r a[1][0] b[1][0]
+		       1 0 0 w a[0][0] b[0][0]
+		       2 0 1 r a[1][0] b[1][1]
+		       3 0 1 w a[0][0] b[0][1]
+		       4 1 0 r a[2][0] b[2][0]
+		       5 1 0 w a[1][0] b[1][0]
+		     after unroll-by-2 and fusion the accesses are done in
+		     this order (from column a): 0,1, 4,5, 2,3, i.e. this:
+		       a i j transformed
+		       0 0 0 r a[1][0] b[1][0]
+		       1 0 0 w a[0][0] b[0][0]
+		       4 1 0 r a[2][0] b[2][0]
+		       5 1 0 w a[1][0] b[1][0]
+		       2 0 1 r a[1][0] b[1][1]  
+		       3 0 1 w a[0][0] b[0][1]
+		     Note how access 2 accesses the same element as access 5
+		     for array 'a' but not for array 'b'.  */
+		  if (any_access_function_variant_p (DDR_A (ddr), inner)
+		      && any_access_function_variant_p (DDR_B (ddr), inner))
+		    ;
+		  else
+		    /* And if any dataref of this pair is invariant with
+		       respect to the inner loop, we have no chance than
+		       to reduce the unroll factor.  */
+		    *unroll = dist;
+		}
+	    }
+	  else if (lambda_vector_lexico_pos (dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
 	    ;
 	  else
 	    *unroll = dist;
@@ -486,7 +553,7 @@  tree_loop_unroll_and_jam (void)
 	  /* Now check the distance vector, for determining a sensible
 	     outer unroll factor, and for validity of merging the inner
 	     loop copies.  */
-	  if (!adjust_unroll_factor (ddr, &unroll_factor, &profit_unroll,
+	  if (!adjust_unroll_factor (loop, ddr, &unroll_factor, &profit_unroll,
 				     &removed))
 	    {
 	      /* Couldn't get the distance vector.  For two reads that's
@@ -506,7 +573,7 @@  tree_loop_unroll_and_jam (void)
 	 to ignore all profitability concerns and apply the transformation
 	 always.  */
       if (!PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT))
-	profit_unroll = 2;
+	profit_unroll = MAX(2, profit_unroll);
       else if (removed * 100 / datarefs.length ()
 	  < (unsigned)PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT))
 	profit_unroll = 1;
diff --git a/gcc/testsuite/gcc.dg/unroll-and-jam.c b/gcc/testsuite/gcc.dg/unroll-and-jam.c
index 70910d3..bcfe1bd 100644
--- a/gcc/testsuite/gcc.dg/unroll-and-jam.c
+++ b/gcc/testsuite/gcc.dg/unroll-and-jam.c
@@ -31,10 +31,10 @@  void checkb(void)
   //printf("  %d\n", sum);
 }
 
+unsigned i, j;
 #define TEST(name, body, test) \
 static void __attribute__((noinline,noclone)) name (unsigned long n, unsigned long m) \
 { \
-  unsigned long i, j; \
   for (i = 1; i < m; i++) { \
       for (j = 1; j < n; j++) { \
 	  body; \
@@ -58,9 +58,14 @@  TEST(foo3, aa[i+1][j-1]=aa[i][j] * aa[i][j] / 2, checkaa()) //notok, -1,1
 TEST(foo4, aa[i][j] = aa[i-1][j+1] * aa[i-1][j+1] / 2, checkaa()) //notok, -1,1
 TEST(foo5, aa[i][j] = aa[i+1][j+1] * aa[i+1][j+1] / 2, checkaa()) //ok, 1,1
 TEST(foo6, aa[i][j] = aa[i+1][j] * aa[i+1][j] / 2, checkaa()) //ok, -1,0
+TEST(foo61, aa[i][0] = aa[i+1][0] * aa[i+1][0] / 2, checkaa()) //notok, -1,0
+TEST(foo62, aa[i][j/2] = aa[i+1][j/2] * aa[i+1][j/2] / 2, checkaa()) //notok, not affine
+TEST(foo63, aa[i][j%2] = aa[i+1][j%2] * aa[i+1][j%2] / 2, checkaa()) //notok, not affine
 TEST(foo7, aa[i+1][j] = aa[i][j] * aa[i][j] / 2, checkaa()) //ok, 1,0
 TEST(foo9, b[j] = 3*b[j+1] + 1, checkb()) //notok, 0,-1
 TEST(foo10, b[j] = 3*b[j] + 1, checkb()) //ok, 0,0
+extern int f;
+TEST(foo11, f = b[i-1] = 1 + 3* b[i+1], checkb()) //ok, 2,0 but must reduce unroll factor to 2, (it would be incorrect with unroll-by-3, which the profitability would suggest)
 
 /* foo8 should work as well, but currently doesn't because the distance
    vectors we compute are too pessimistic.  We compute
@@ -68,6 +73,7 @@  TEST(foo10, b[j] = 3*b[j] + 1, checkb()) //ok, 0,0
    and the last one causes us to lose.  */
 TEST(foo8, b[j+1] = 3*b[j] + 1, checkb()) //ok, 0,1
 
+int f;
 unsigned int a[1024];
 unsigned int b[1024];
 unsigned int aa[16][1024];
@@ -88,10 +94,12 @@  void init(void)
     printf(" %s\n", #name); \
     init();for(i=0;i<4;i++)name##noopt(32,8); checka = checksum; \
     init();for(i=0;i<4;i++)name(32,8); \
+    if (checka != checksum) fail = 1; \
     printf("%sok %s\n", checka != checksum ? "NOT " : "", #name);
 
 int main()
 {
+  int fail = 0;
   int i;
   unsigned checka;
   RUN(foo1);
@@ -100,12 +108,18 @@  int main()
   RUN(foo4);
   RUN(foo5);
   RUN(foo6);
+  RUN(foo61);
+  RUN(foo62);
+  RUN(foo63);
   RUN(foo7);
   RUN(foo8);
   RUN(foo9);
   RUN(foo10);
-  return 0;
+  RUN(foo11);
+  if (fail)
+    __builtin_abort();
+  return fail;
 }
 
-/* Five loops should be unroll-jammed (actually six, but see above).  */
-/* { dg-final { scan-tree-dump-times "applying unroll and jam" 5 "unrolljam" } } */
+/* Six loops should be unroll-jammed (actually seven, but see above).  */
+/* { dg-final { scan-tree-dump-times "applying unroll and jam" 6 "unrolljam" } } */