diff mbox series

Fix profile update after loop-ch and cunroll

Message ID ZKbYkVxNoKnkW0Px@kam.mff.cuni.cz
State New
Headers show
Series Fix profile update after loop-ch and cunroll | expand

Commit Message

Jan Hubicka July 6, 2023, 3:06 p.m. UTC
Hi,
this patch makes loop-ch and loop unrolling fix the profile in case the loop is
known to not iterate at all (or to iterate only a few times) while the profile
claims it iterates more.  While this is kind of a symptomatic fix, it is the best
we can do in case the profile was originally estimated incorrectly.

In the testcase the problematic loop is produced by the vectorizer, and I think
the vectorizer should know, and account for in its costs, that the vectorized
loop and/or epilogue is not going to loop after the transformation.  So it would
be nice to fix it on that side, too.

The patch avoids about half of profile mismatches caused by cunroll.

Pass dump id and name            |static mismatch|dynamic mismatch
                                 |in count     |in count
107t cunrolli                    |      3    +3|        17251   +17251
115t threadfull                  |      3      |        14376    -2875
116t vrp                         |      5    +2|        30908   +16532
117t dse                         |      5      |        30908
118t dce                         |      3    -2|        17251   -13657
127t ch                          |     13   +10|        17251
131t dom                         |     39   +26|        17251
133t isolate-paths               |     47    +8|        17251
134t reassoc                     |     49    +2|        17251
136t forwprop                    |     53    +4|       202501  +185250
159t cddce                       |     61    +8|       216211   +13710
161t ldist                       |     62    +1|       216211
172t ifcvt                       |     66    +4|       373711  +157500
173t vect                        |    143   +77|      9802097 +9428386
176t cunroll                     |    221   +78|     15639591 +5837494
183t loopdone                    |    218    -3|     15577640   -61951
195t fre                         |    214    -4|     15577640
197t dom                         |    213    -1|     16671606 +1093966
199t threadfull                  |    215    +2|     16879581  +207975
200t vrp                         |    217    +2|     17077750  +198169
204t dce                         |    215    -2|     17004486   -73264
206t sink                        |    213    -2|     17004486
211t cddce                       |    219    +6|     17005926    +1440
255t optimized                   |    217    -2|     17005926
256r expand                      |    210    -7|     19571573 +2565647
258r into_cfglayout              |    208    -2|     19571573
275r loop2_unroll                |    212    +4|     22992432 +3420859
291r ce2                         |    210    -2|     23011838
312r pro_and_epilogue            |    230   +20|     23073776   +61938
315r jump2                       |    236    +6|     27110534 +4036758
323r bbro                        |    229    -7|     21826835 -5283699


W/o the patch cunroll does:

176t cunroll                     |    294  +151|126548439   +116746342

and we end up with 291 mismatches at bbro.

Bootstrapped/regtested x86_64-linux. Plan to commit it after the scale_loop_frequency patch.

gcc/ChangeLog:

	PR middle-end/25623
	* tree-ssa-loop-ch.cc (ch_base::copy_headers): Scale loop frequency to maximal number
	of iterations determined.
	* tree-ssa-loop-ivcanon.cc (try_unroll_loop_completely): Likewise.

gcc/testsuite/ChangeLog:

	PR middle-end/25623
	* gfortran.dg/pr25623-2.f90: New test.
diff mbox series

Patch

diff --git a/gcc/testsuite/gfortran.dg/pr25623-2.f90 b/gcc/testsuite/gfortran.dg/pr25623-2.f90
new file mode 100644
index 00000000000..57679e0d6ed
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/pr25623-2.f90
@@ -0,0 +1,19 @@ 
+! { dg-do compile }
+! { dg-options "-fdump-tree-optimized-blocks -O3" }
+
+SUBROUTINE S42(a,b,c,N)
+ IMPLICIT NONE
+ integer :: N
+ real*8  :: a(N),b(N),c(N),tmp,tmp2,tmp4
+ real*8, parameter :: p=1.0D0/3.0D0
+ integer :: i
+ c=0.0D0
+ DO i=1,N
+   tmp=a(i)**p ! could even be done with a cube root
+   tmp2=tmp*tmp
+   tmp4=tmp2*tmp2
+   b(i)=b(i)+tmp4
+   c(i)=c(i)+tmp2
+ ENDDO
+END SUBROUTINE
+! { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } }
diff --git a/gcc/tree-ssa-loop-ch.cc b/gcc/tree-ssa-loop-ch.cc
index 291f2dbcab9..72792cec21f 100644
--- a/gcc/tree-ssa-loop-ch.cc
+++ b/gcc/tree-ssa-loop-ch.cc
@@ -422,6 +422,7 @@  ch_base::copy_headers (function *fun)
 	{
 	  if (dump_file && (dump_flags & TDF_DETAILS))
 	    fprintf (dump_file, "Loop %d never loops.\n", loop->num);
+	  scale_loop_profile (loop, profile_probability::always (), 0);
 	  loops_to_unloop.safe_push (loop);
 	  loops_to_unloop_nunroll.safe_push (0);
 	  continue;
@@ -666,6 +667,7 @@  ch_base::copy_headers (function *fun)
 	{
 	  if (dump_file && (dump_flags & TDF_DETAILS))
 	    fprintf (dump_file, "Loop %d no longer loops.\n", loop->num);
+	  scale_loop_profile (loop, profile_probability::always (), 0);
 	  loops_to_unloop.safe_push (loop);
 	  loops_to_unloop_nunroll.safe_push (0);
 	}
diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc
index 491b57ec0f1..184c08eec75 100644
--- a/gcc/tree-ssa-loop-ivcanon.cc
+++ b/gcc/tree-ssa-loop-ivcanon.cc
@@ -906,6 +906,10 @@  try_unroll_loop_completely (class loop *loop,
       if (may_be_zero)
 	bitmap_clear_bit (wont_exit, 1);
 
+      /* If loop was originally estimated to iterate too many times,
+         reduce the profile to avoid new profile inconsistencies.  */
+      scale_loop_profile (loop, profile_probability::always (), n_unroll);
+
       if (!gimple_duplicate_loop_body_to_header_edge (
 	    loop, loop_preheader_edge (loop), n_unroll, wont_exit, exit,
 	    &edges_to_remove,
@@ -919,6 +923,8 @@  try_unroll_loop_completely (class loop *loop,
 
       free_original_copy_tables ();
     }
+  else
+    scale_loop_profile (loop, profile_probability::always (), 0);
 
   /* Remove the conditional from the last copy of the loop.  */
   if (edge_to_cancel)