diff mbox series

tree-optimization/99956 - improve loop interchange

Message ID nycvar.YFH.7.76.2104071841180.16468@elmra.sevgm.obk
State New
Headers show
Series tree-optimization/99956 - improve loop interchange | expand

Commit Message

Richard Biener April 7, 2021, 4:42 p.m. UTC
When we apply store motion and DSE manually to the bwaves kernel
in gfortran.dg/pr81303.f loop interchange no longer happens because
the perfect nest considered covers outer loops we cannot analyze
strides for.  The following compensates for this by shrinking the
nest in this analysis which was already possible but on a too coarse
granularity.  It shares the shrinked nest with the rest of the DRs
so the complexity overhead should be negligible.

Bootstrapped & tested on x86_64-unknown-linux-gnu, SPEC FP CPU 2017
build and there's still only the single loop interchange in bwaves.

Queued for stage1.

2021-04-07  Richard Biener  <rguenther@suse.de>

	PR tree-optimization/99956
	* gimple-loop-interchange.cc (compute_access_stride):
	Try instantiating the access in a shallower loop nest
	if instantiating failed.
	(compute_access_strides): Pass adjustable loop_nest
	to compute_access_stride.

	* gfortran.dg/pr99956.f: New testcase.
---
 gcc/gimple-loop-interchange.cc      | 68 +++++++++++++++++------------
 gcc/testsuite/gfortran.dg/pr99956.f | 45 +++++++++++++++++++
 2 files changed, 85 insertions(+), 28 deletions(-)
 create mode 100644 gcc/testsuite/gfortran.dg/pr99956.f

Comments

Richard Biener April 26, 2021, 11:58 a.m. UTC | #1
On Wed, Apr 7, 2021 at 6:43 PM Richard Biener <rguenther@suse.de> wrote:
>
> When we apply store motion and DSE manually to the bwaves kernel
> in gfortran.dg/pr81303.f loop interchange no longer happens because
> the perfect nest considered covers outer loops we cannot analyze
> strides for.  The following compensates for this by shrinking the
> nest in this analysis which was already possible but on a too coarse
> granularity.  It shares the shrinked nest with the rest of the DRs
> so the complexity overhead should be negligible.
>
> Bootstrapped & tested on x86_64-unknown-linux-gnu, SPEC FP CPU 2017
> build and there's still only the single loop interchange in bwaves.
>
> Queued for stage1.

And applied as g:6ff66d1ea48960fe96bb51a750c01135e65fe452

> 2021-04-07  Richard Biener  <rguenther@suse.de>
>
>         PR tree-optimization/99956
>         * gimple-loop-interchange.cc (compute_access_stride):
>         Try instantiating the access in a shallower loop nest
>         if instantiating failed.
>         (compute_access_strides): Pass adjustable loop_nest
>         to compute_access_stride.
>
>         * gfortran.dg/pr99956.f: New testcase.
> ---
>  gcc/gimple-loop-interchange.cc      | 68 +++++++++++++++++------------
>  gcc/testsuite/gfortran.dg/pr99956.f | 45 +++++++++++++++++++
>  2 files changed, 85 insertions(+), 28 deletions(-)
>  create mode 100644 gcc/testsuite/gfortran.dg/pr99956.f
>
> diff --git a/gcc/gimple-loop-interchange.cc b/gcc/gimple-loop-interchange.cc
> index f45b9364644..80f749b6071 100644
> --- a/gcc/gimple-loop-interchange.cc
> +++ b/gcc/gimple-loop-interchange.cc
> @@ -1280,12 +1280,15 @@ tree_loop_interchange::move_code_to_inner_loop (class loop *outer,
>            arr[i][j - 1][k] = 0;  */
>
>  static void
> -compute_access_stride (class loop *loop_nest, class loop *loop,
> +compute_access_stride (class loop *&loop_nest, class loop *loop,
>                        data_reference_p dr)
>  {
>    vec<tree> *strides = new vec<tree> ();
> -  basic_block bb = gimple_bb (DR_STMT (dr));
> +  dr->aux = strides;
>
> +  basic_block bb = gimple_bb (DR_STMT (dr));
> +  if (!flow_bb_inside_loop_p (loop_nest, bb))
> +    return;
>    while (!flow_bb_inside_loop_p (loop, bb))
>      {
>        strides->safe_push (build_int_cst (sizetype, 0));
> @@ -1313,39 +1316,47 @@ compute_access_stride (class loop *loop_nest, class loop *loop,
>         }
>        /* Otherwise punt.  */
>        else
> -       {
> -         dr->aux = strides;
> -         return;
> -       }
> +       return;
>      }
>    tree scev_base = build_fold_addr_expr (ref);
>    tree scev = analyze_scalar_evolution (loop, scev_base);
> -  scev = instantiate_scev (loop_preheader_edge (loop_nest), loop, scev);
> -  if (! chrec_contains_undetermined (scev))
> +  if (chrec_contains_undetermined (scev))
> +    return;
> +
> +  tree orig_scev = scev;
> +  do
> +    {
> +      scev = instantiate_scev (loop_preheader_edge (loop_nest),
> +                              loop, orig_scev);
> +      if (! chrec_contains_undetermined (scev))
> +       break;
> +
> +      /* If we couldn't instantiate for the desired nest, shrink it.  */
> +      if (loop_nest == loop)
> +       return;
> +      loop_nest = loop_nest->inner;
> +    } while (1);
> +
> +  tree sl = scev;
> +  class loop *expected = loop;
> +  while (TREE_CODE (sl) == POLYNOMIAL_CHREC)
>      {
> -      tree sl = scev;
> -      class loop *expected = loop;
> -      while (TREE_CODE (sl) == POLYNOMIAL_CHREC)
> +      class loop *sl_loop = get_chrec_loop (sl);
> +      while (sl_loop != expected)
>         {
> -         class loop *sl_loop = get_chrec_loop (sl);
> -         while (sl_loop != expected)
> -           {
> -             strides->safe_push (size_int (0));
> -             expected = loop_outer (expected);
> -           }
> -         strides->safe_push (CHREC_RIGHT (sl));
> -         sl = CHREC_LEFT (sl);
> +         strides->safe_push (size_int (0));
>           expected = loop_outer (expected);
>         }
> -      if (! tree_contains_chrecs (sl, NULL))
> -       while (expected != loop_outer (loop_nest))
> -         {
> -           strides->safe_push (size_int (0));
> -           expected = loop_outer (expected);
> -         }
> +      strides->safe_push (CHREC_RIGHT (sl));
> +      sl = CHREC_LEFT (sl);
> +      expected = loop_outer (expected);
>      }
> -
> -  dr->aux = strides;
> +  if (! tree_contains_chrecs (sl, NULL))
> +    while (expected != loop_outer (loop_nest))
> +      {
> +       strides->safe_push (size_int (0));
> +       expected = loop_outer (expected);
> +      }
>  }
>
>  /* Given loop nest LOOP_NEST with innermost LOOP, the function computes
> @@ -1363,9 +1374,10 @@ compute_access_strides (class loop *loop_nest, class loop *loop,
>    data_reference_p dr;
>    vec<tree> *stride;
>
> +  class loop *interesting_loop_nest = loop_nest;
>    for (i = 0; datarefs.iterate (i, &dr); ++i)
>      {
> -      compute_access_stride (loop_nest, loop, dr);
> +      compute_access_stride (interesting_loop_nest, loop, dr);
>        stride = DR_ACCESS_STRIDE (dr);
>        if (stride->length () < num_loops)
>         {
> diff --git a/gcc/testsuite/gfortran.dg/pr99956.f b/gcc/testsuite/gfortran.dg/pr99956.f
> new file mode 100644
> index 00000000000..b5c0be3912d
> --- /dev/null
> +++ b/gcc/testsuite/gfortran.dg/pr99956.f
> @@ -0,0 +1,45 @@
> +! { dg-do compile }
> +! { dg-options "-O3 -ffast-math -floop-interchange -fdump-tree-linterchange-details" }
> +
> +        subroutine mat_times_vec(y,x,a,axp,ayp,azp,axm,aym,azm,
> +     $  nb,nx,ny,nz)
> +        implicit none
> +        integer nb,nx,ny,nz,i,j,k,m,l,kit,im1,ip1,jm1,jp1,km1,kp1
> +
> +        real*8 y(nb,nx,ny,nz),x(nb,nx,ny,nz),tem
> +
> +        real*8 a(nb,nb,nx,ny,nz),
> +     1  axp(nb,nb,nx,ny,nz),ayp(nb,nb,nx,ny,nz),azp(nb,nb,nx,ny,nz),
> +     2  axm(nb,nb,nx,ny,nz),aym(nb,nb,nx,ny,nz),azm(nb,nb,nx,ny,nz)
> +
> +
> +      do k=1,nz
> +         km1=mod(k+nz-2,nz)+1
> +         kp1=mod(k,nz)+1
> +         do j=1,ny
> +            jm1=mod(j+ny-2,ny)+1
> +            jp1=mod(j,ny)+1
> +            do i=1,nx
> +               im1=mod(i+nx-2,nx)+1
> +               ip1=mod(i,nx)+1
> +               do l=1,nb
> +                  tem=0.0
> +                  do m=1,nb
> +                     tem=tem+
> +     1               a(l,m,i,j,k)*x(m,i,j,k)+
> +     2               axp(l,m,i,j,k)*x(m,ip1,j,k)+
> +     3               ayp(l,m,i,j,k)*x(m,i,jp1,k)+
> +     4               azp(l,m,i,j,k)*x(m,i,j,kp1)+
> +     5               axm(l,m,i,j,k)*x(m,im1,j,k)+
> +     6               aym(l,m,i,j,k)*x(m,i,jm1,k)+
> +     7               azm(l,m,i,j,k)*x(m,i,j,km1)
> +                  enddo
> +                  y(l,i,j,k)=tem
> +               enddo
> +            enddo
> +         enddo
> +        enddo
> +        return
> +        end
> +
> +! { dg-final { scan-tree-dump-times "is interchanged" 1 "linterchange" } }
> --
> 2.26.2
diff mbox series

Patch

diff --git a/gcc/gimple-loop-interchange.cc b/gcc/gimple-loop-interchange.cc
index f45b9364644..80f749b6071 100644
--- a/gcc/gimple-loop-interchange.cc
+++ b/gcc/gimple-loop-interchange.cc
@@ -1280,12 +1280,15 @@  tree_loop_interchange::move_code_to_inner_loop (class loop *outer,
 	   arr[i][j - 1][k] = 0;  */
 
 static void
-compute_access_stride (class loop *loop_nest, class loop *loop,
+compute_access_stride (class loop *&loop_nest, class loop *loop,
 		       data_reference_p dr)
 {
   vec<tree> *strides = new vec<tree> ();
-  basic_block bb = gimple_bb (DR_STMT (dr));
+  dr->aux = strides;
 
+  basic_block bb = gimple_bb (DR_STMT (dr));
+  if (!flow_bb_inside_loop_p (loop_nest, bb))
+    return;
   while (!flow_bb_inside_loop_p (loop, bb))
     {
       strides->safe_push (build_int_cst (sizetype, 0));
@@ -1313,39 +1316,47 @@  compute_access_stride (class loop *loop_nest, class loop *loop,
 	}
       /* Otherwise punt.  */
       else
-	{
-	  dr->aux = strides;
-	  return;
-	}
+	return;
     }
   tree scev_base = build_fold_addr_expr (ref);
   tree scev = analyze_scalar_evolution (loop, scev_base);
-  scev = instantiate_scev (loop_preheader_edge (loop_nest), loop, scev);
-  if (! chrec_contains_undetermined (scev))
+  if (chrec_contains_undetermined (scev))
+    return;
+
+  tree orig_scev = scev;
+  do
+    {
+      scev = instantiate_scev (loop_preheader_edge (loop_nest),
+			       loop, orig_scev);
+      if (! chrec_contains_undetermined (scev))
+	break;
+
+      /* If we couldn't instantiate for the desired nest, shrink it.  */
+      if (loop_nest == loop)
+	return;
+      loop_nest = loop_nest->inner;
+    } while (1);
+
+  tree sl = scev;
+  class loop *expected = loop;
+  while (TREE_CODE (sl) == POLYNOMIAL_CHREC)
     {
-      tree sl = scev;
-      class loop *expected = loop;
-      while (TREE_CODE (sl) == POLYNOMIAL_CHREC)
+      class loop *sl_loop = get_chrec_loop (sl);
+      while (sl_loop != expected)
 	{
-	  class loop *sl_loop = get_chrec_loop (sl);
-	  while (sl_loop != expected)
-	    {
-	      strides->safe_push (size_int (0));
-	      expected = loop_outer (expected);
-	    }
-	  strides->safe_push (CHREC_RIGHT (sl));
-	  sl = CHREC_LEFT (sl);
+	  strides->safe_push (size_int (0));
 	  expected = loop_outer (expected);
 	}
-      if (! tree_contains_chrecs (sl, NULL))
-	while (expected != loop_outer (loop_nest))
-	  {
-	    strides->safe_push (size_int (0));
-	    expected = loop_outer (expected);
-	  }
+      strides->safe_push (CHREC_RIGHT (sl));
+      sl = CHREC_LEFT (sl);
+      expected = loop_outer (expected);
     }
-
-  dr->aux = strides;
+  if (! tree_contains_chrecs (sl, NULL))
+    while (expected != loop_outer (loop_nest))
+      {
+	strides->safe_push (size_int (0));
+	expected = loop_outer (expected);
+      }
 }
 
 /* Given loop nest LOOP_NEST with innermost LOOP, the function computes
@@ -1363,9 +1374,10 @@  compute_access_strides (class loop *loop_nest, class loop *loop,
   data_reference_p dr;
   vec<tree> *stride;
 
+  class loop *interesting_loop_nest = loop_nest;
   for (i = 0; datarefs.iterate (i, &dr); ++i)
     {
-      compute_access_stride (loop_nest, loop, dr);
+      compute_access_stride (interesting_loop_nest, loop, dr);
       stride = DR_ACCESS_STRIDE (dr);
       if (stride->length () < num_loops)
 	{
diff --git a/gcc/testsuite/gfortran.dg/pr99956.f b/gcc/testsuite/gfortran.dg/pr99956.f
new file mode 100644
index 00000000000..b5c0be3912d
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/pr99956.f
@@ -0,0 +1,45 @@ 
+! { dg-do compile }
+! { dg-options "-O3 -ffast-math -floop-interchange -fdump-tree-linterchange-details" }
+
+        subroutine mat_times_vec(y,x,a,axp,ayp,azp,axm,aym,azm,
+     $  nb,nx,ny,nz)
+        implicit none
+        integer nb,nx,ny,nz,i,j,k,m,l,kit,im1,ip1,jm1,jp1,km1,kp1
+
+        real*8 y(nb,nx,ny,nz),x(nb,nx,ny,nz),tem
+
+        real*8 a(nb,nb,nx,ny,nz),
+     1  axp(nb,nb,nx,ny,nz),ayp(nb,nb,nx,ny,nz),azp(nb,nb,nx,ny,nz),
+     2  axm(nb,nb,nx,ny,nz),aym(nb,nb,nx,ny,nz),azm(nb,nb,nx,ny,nz)
+
+
+      do k=1,nz
+         km1=mod(k+nz-2,nz)+1
+         kp1=mod(k,nz)+1
+         do j=1,ny
+            jm1=mod(j+ny-2,ny)+1
+            jp1=mod(j,ny)+1
+            do i=1,nx
+               im1=mod(i+nx-2,nx)+1
+               ip1=mod(i,nx)+1
+               do l=1,nb
+                  tem=0.0
+                  do m=1,nb
+                     tem=tem+
+     1               a(l,m,i,j,k)*x(m,i,j,k)+
+     2               axp(l,m,i,j,k)*x(m,ip1,j,k)+
+     3               ayp(l,m,i,j,k)*x(m,i,jp1,k)+
+     4               azp(l,m,i,j,k)*x(m,i,j,kp1)+
+     5               axm(l,m,i,j,k)*x(m,im1,j,k)+
+     6               aym(l,m,i,j,k)*x(m,i,jm1,k)+
+     7               azm(l,m,i,j,k)*x(m,i,j,km1)
+                  enddo
+                  y(l,i,j,k)=tem
+               enddo
+            enddo
+         enddo
+        enddo          
+        return
+        end
+
+! { dg-final { scan-tree-dump-times "is interchanged" 1 "linterchange" } }