diff mbox series

Fix profile update after peeled epilogues

Message ID ZNAB3BD7KOYK4iiJ@kam.mff.cuni.cz
State New
Headers show
Series Fix profile update after peeled epilogues | expand

Commit Message

Jan Hubicka Aug. 6, 2023, 8:26 p.m. UTC
Hi,
Epilogue peeling expects the scalar loop to have the same number of executions as
the vector loop, which is true at the beginning of vectorization.  However, if the
epilogues are vectorized, this is no longer the case.  In this situation the
loop preheader is replaced by new guard code with a correct profile, but the
loop body is left unscaled.  This leads to a loop that exits more often than
it is entered.

This patch adds logic to scale the frequencies down and also to fix the profile
of the original preheader where necessary.

Bootstrapped/regtested x86_64-linux, committed.

gcc/ChangeLog:

	* tree-vect-loop-manip.cc (vect_do_peeling): Fix profile update of peeled epilogues.

gcc/testsuite/ChangeLog:

	* gcc.dg/vect/vect-bitfield-read-1.c: Check profile consistency.
	* gcc.dg/vect/vect-bitfield-read-2.c: Check profile consistency.
	* gcc.dg/vect/vect-bitfield-read-3.c: Check profile consistency.
	* gcc.dg/vect/vect-bitfield-read-4.c: Check profile consistency.
	* gcc.dg/vect/vect-bitfield-read-5.c: Check profile consistency.
	* gcc.dg/vect/vect-bitfield-read-6.c: Check profile consistency.
	* gcc.dg/vect/vect-bitfield-read-7.c: Check profile consistency.
	* gcc.dg/vect/vect-bitfield-write-1.c: Check profile consistency.
	* gcc.dg/vect/vect-bitfield-write-2.c: Check profile consistency.
	* gcc.dg/vect/vect-bitfield-write-3.c: Check profile consistency.
	* gcc.dg/vect/vect-bitfield-write-4.c: Check profile consistency.
	* gcc.dg/vect/vect-bitfield-write-5.c: Check profile consistency.
	* gcc.dg/vect/vect-epilogues-2.c: Check profile consistency.
	* gcc.dg/vect/vect-epilogues.c: Check profile consistency.
	* gcc.dg/vect/vect-mask-store-move-1.c: Check profile consistency.
diff mbox series

Patch

diff --git a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-1.c b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-1.c
index 42e50d9f0c8..147c959568d 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-1.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-1.c
@@ -1,3 +1,4 @@ 
+/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */
 /* { dg-require-effective-target vect_int } */
 /* { dg-require-effective-target vect_shift } */
 
@@ -39,3 +40,4 @@  int main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-2.c b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-2.c
index a9aeefcd72c..982e6a7967b 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-2.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-2.c
@@ -1,3 +1,4 @@ 
+/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */
 /* { dg-require-effective-target vect_shift } */
 /* { dg-require-effective-target vect_long_long } */
 
@@ -42,3 +43,4 @@  int main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-3.c b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-3.c
index c7d0fd26bad..f2a43c39f50 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-3.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-3.c
@@ -1,3 +1,4 @@ 
+/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */
 /* { dg-require-effective-target vect_int } */
 /* { dg-require-effective-target vect_shift } */
 
@@ -43,3 +44,4 @@  int main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-4.c b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-4.c
index 6a3ed8c0c6f..9f6f0220664 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-4.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-4.c
@@ -1,3 +1,4 @@ 
+/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */
 /* { dg-require-effective-target vect_shift } */
 /* { dg-require-effective-target vect_long_long } */
 
@@ -44,3 +45,4 @@  int main (void)
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
 
+/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-5.c b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-5.c
index b2889df8a0a..662aed104cf 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-5.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-5.c
@@ -1,3 +1,4 @@ 
+/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */
 /* { dg-require-effective-target vect_int } */
 /* { dg-require-effective-target vect_shift } */
 
@@ -41,3 +42,4 @@  int main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-6.c b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-6.c
index 2445f531be2..9b315d6be86 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-6.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-6.c
@@ -1,3 +1,4 @@ 
+/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */
 /* { dg-require-effective-target vect_int } */
 /* { dg-require-effective-target vect_shift } */
 
@@ -41,3 +42,4 @@  int main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-7.c b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-7.c
index 4b1ec8a6dab..6d1043dd971 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-7.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-bitfield-read-7.c
@@ -1,3 +1,4 @@ 
+/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */
 /* { dg-require-effective-target vect_int } */
 /* { dg-require-effective-target vect_shift } */
 
@@ -42,3 +43,4 @@  int main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-1.c b/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-1.c
index 22e62353014..7c710cf5a57 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-1.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-1.c
@@ -1,3 +1,4 @@ 
+/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */
 /* { dg-require-effective-target vect_int } */
 
 #include <stdarg.h>
@@ -38,3 +39,4 @@  int main (void)
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
 
+/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-2.c b/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-2.c
index 0c8291c9363..3b609183c54 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-2.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-2.c
@@ -1,3 +1,4 @@ 
+/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */
 /* { dg-require-effective-target vect_int } */
 /* { dg-require-effective-target vect_long_long } */
 
@@ -42,3 +43,4 @@  int main (void)
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
 
+/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-3.c b/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-3.c
index 46fcb02b2f1..e96da82c214 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-3.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-3.c
@@ -1,3 +1,4 @@ 
+/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */
 /* { dg-require-effective-target vect_int } */
 /* { dg-require-effective-target vect_long_long } */
 
@@ -43,3 +44,4 @@  int main (void)
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
 
+/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-4.c b/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-4.c
index 5a7227a93e4..66442213c9f 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-4.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-4.c
@@ -1,3 +1,4 @@ 
+/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */
 /* { dg-require-effective-target vect_int } */
 
 #include <stdarg.h>
@@ -41,3 +42,4 @@  int main (void)
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
 
+/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-5.c b/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-5.c
index e0b36e411a4..386de504aad 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-5.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-bitfield-write-5.c
@@ -1,3 +1,4 @@ 
+/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */
 /* { dg-require-effective-target vect_int } */
 
 #include <stdarg.h>
@@ -41,3 +42,4 @@  int main (void)
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
 
+/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-epilogues-2.c b/gcc/testsuite/gcc.dg/vect/vect-epilogues-2.c
index b251e1f2dfd..63c5e231f85 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-epilogues-2.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-epilogues-2.c
@@ -1,3 +1,4 @@ 
+/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */
 /* { dg-do compile } */
 
 int
@@ -55,3 +56,4 @@  f6 (int *x, int a)
   x[a] += 1;
   return res;
 }
+/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-epilogues.c b/gcc/testsuite/gcc.dg/vect/vect-epilogues.c
index ab7e8a1a759..11b8c83b7ba 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-epilogues.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-epilogues.c
@@ -1,3 +1,4 @@ 
+/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */
 /* { dg-do compile } */
 
 /* Copied from PR 88915.  */
@@ -17,3 +18,4 @@  void pixel_avg( unsigned char *dst, int i_dst_stride,
  }
 
 /* { dg-final { scan-tree-dump "LOOP EPILOGUE VECTORIZED" "vect" { target vect_multiple_sizes xfail { { arm32 && be } || vect_partial_vectors_usage_2 } } } } */
+/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-mask-store-move-1.c b/gcc/testsuite/gcc.dg/vect/vect-mask-store-move-1.c
index 1e06b588c0f..700adf9e1d4 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-mask-store-move-1.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-mask-store-move-1.c
@@ -1,3 +1,4 @@ 
+/* { dg-additional-options "-fdump-tree-optimized-details-blocks" } */
 /* { dg-do compile } */
 /* { dg-additional-options "-mavx2" { target { i?86-*-* x86_64-*-* } } } */
 
@@ -16,3 +17,4 @@  void foo (int n)
 }
 
 /* { dg-final { scan-tree-dump-times "Move stmt to created bb" 4 "vect" { target { i?86-*-* x86_64-*-* } xfail { i?86-*-* x86_64-*-* } } } } */
+/* { dg-final { scan-tree-dump-not "Invalid sum" "optimized" } } */
diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index 9de897d05a5..0e7e223f22a 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -3271,6 +3271,7 @@  vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
       adjust_vec_debug_stmts ();
       scev_reset ();
     }
+  basic_block bb_before_epilog = NULL;
 
   if (epilog_peeling)
     {
@@ -3290,6 +3291,7 @@  vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
 
       epilog->force_vectorize = false;
       slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
+      bb_before_epilog = loop_preheader_edge (epilog)->src;
 
       /* Scalar version loop may be preferred.  In this case, add guard
 	 and skip to epilog.  Note this only happens when the number of
@@ -3317,6 +3319,7 @@  vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
 
 	  /* Simply propagate profile info from guard_bb to guard_to which is
 	     a merge point of control flow.  */
+	  profile_count old_count = guard_to->count;
 	  guard_to->count = guard_bb->count;
 
 	  /* Restore the counts of the epilog loop if we didn't use the scalar loop. */
@@ -3332,9 +3335,15 @@  vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
 	      free (bbs);
 	      free (original_bbs);
 	    }
-	}
+	  else
+	    scale_loop_profile (epilog, guard_to->count.probability_in (old_count), -1);
 
-      basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
+	  /* Only need to handle basic block before epilog loop if it's not
+	     the guard_bb, which is the case when skip_vector is true.  */
+	  if (guard_bb != bb_before_epilog)
+	    bb_before_epilog->count = single_pred_edge (bb_before_epilog)->count ();
+	  bb_before_epilog = loop_preheader_edge (epilog)->src;
+	}
       /* If loop is peeled for non-zero constant times, now niters refers to
 	 orig_niters - prolog_peeling, it won't overflow even the orig_niters
 	 overflows.  */