diff mbox series

Try fix PR90911

Message ID alpine.LSU.2.20.1906251350380.10704@zhemvz.fhfr.qr
State New
Headers show
Series Try fix PR90911 | expand

Commit Message

Richard Biener June 25, 2019, 12:59 p.m. UTC
PR90911 reports a slowdown of 456.hmmer with the recent introduction
of vectorizer versioning of outer loops, more specifically the case
of re-using if-conversion created versions.

The patch below fixes things up to adjust the edge probability
and scale the loop bodies in two steps, delaying scalar_loop
scaling until all peeling is done.  This restores profile-mismatches
to the same state as it was on the GCC 9 branch and seems to
fix the observed slowdown of 456.hmmer.

Boostrap & regtest running on x86_64-unknown-linux-gnu.

Honza, does this look OK?

Thanks,
Richard.

2019-06-25  Richard Biener  <rguenther@suse.de>

	* tree-vectorizer.h (_loop_vec_info::scalar_loop_scaling): New field.
	(LOOP_VINFO_SCALAR_LOOP_SCALING): new.
	* tree-vect-loop.c (_loop_vec_info::_loop_vec_info): Initialize
	scalar_loop_scaling.
	(vect_transform_loop): Scale scalar loop profile if needed.
	* tree-vect-loop-manip.c (vect_loop_versioning): When re-using
	the loop copy from if-conversion adjust edge probabilities
	and scale the vectorized loop body profile, queue the scalar
	profile for updating after peeling.

Comments

Richard Biener July 3, 2019, 10:51 a.m. UTC | #1
On Tue, 25 Jun 2019, Richard Biener wrote:

> 
> PR90911 reports a slowdown of 456.hmmer with the recent introduction
> of vectorizer versioning of outer loops, more specifically the case
> of re-using if-conversion created versions.
> 
> The patch below fixes things up to adjust the edge probability
> and scale the loop bodies in two steps, delaying scalar_loop
> scaling until all peeling is done.  This restores profile-mismatches
> to the same state as it was on the GCC 9 branch and seems to
> fix the observed slowdown of 456.hmmer.
> 
> Boostrap & regtest running on x86_64-unknown-linux-gnu.
> 
> Honza, does this look OK?

Ping.

> Thanks,
> Richard.
> 
> 2019-06-25  Richard Biener  <rguenther@suse.de>
> 
> 	* tree-vectorizer.h (_loop_vec_info::scalar_loop_scaling): New field.
> 	(LOOP_VINFO_SCALAR_LOOP_SCALING): new.
> 	* tree-vect-loop.c (_loop_vec_info::_loop_vec_info): Initialize
> 	scalar_loop_scaling.
> 	(vect_transform_loop): Scale scalar loop profile if needed.
> 	* tree-vect-loop-manip.c (vect_loop_versioning): When re-using
> 	the loop copy from if-conversion adjust edge probabilities
> 	and scale the vectorized loop body profile, queue the scalar
> 	profile for updating after peeling.
> 
> Index: gcc/tree-vectorizer.h
> ===================================================================
> --- gcc/tree-vectorizer.h	(revision 272636)
> +++ gcc/tree-vectorizer.h	(working copy)
> @@ -548,6 +548,9 @@ typedef struct _loop_vec_info : public v
>    /* Mark loops having masked stores.  */
>    bool has_mask_store;
>  
> +  /* Queued scaling factor for the scalar loop.  */
> +  profile_probability scalar_loop_scaling;
> +
>    /* If if-conversion versioned this loop before conversion, this is the
>       loop version without if-conversion.  */
>    struct loop *scalar_loop;
> @@ -603,6 +606,7 @@ typedef struct _loop_vec_info : public v
>  #define LOOP_VINFO_PEELING_FOR_NITER(L)    (L)->peeling_for_niter
>  #define LOOP_VINFO_NO_DATA_DEPENDENCIES(L) (L)->no_data_dependencies
>  #define LOOP_VINFO_SCALAR_LOOP(L)	   (L)->scalar_loop
> +#define LOOP_VINFO_SCALAR_LOOP_SCALING(L)  (L)->scalar_loop_scaling
>  #define LOOP_VINFO_HAS_MASK_STORE(L)       (L)->has_mask_store
>  #define LOOP_VINFO_SCALAR_ITERATION_COST(L) (L)->scalar_cost_vec
>  #define LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST(L) (L)->single_scalar_iteration_cost
> Index: gcc/tree-vect-loop.c
> ===================================================================
> --- gcc/tree-vect-loop.c	(revision 272636)
> +++ gcc/tree-vect-loop.c	(working copy)
> @@ -835,6 +835,7 @@ _loop_vec_info::_loop_vec_info (struct l
>      operands_swapped (false),
>      no_data_dependencies (false),
>      has_mask_store (false),
> +    scalar_loop_scaling (profile_probability::uninitialized ()),
>      scalar_loop (NULL),
>      orig_loop_info (NULL)
>  {
> @@ -8562,6 +8563,10 @@ vect_transform_loop (loop_vec_info loop_
>    epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
>  			      &step_vector, &niters_vector_mult_vf, th,
>  			      check_profitability, niters_no_overflow);
> +  if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
> +      && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
> +    scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
> +			    LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
>  
>    if (niters_vector == NULL_TREE)
>      {
> Index: gcc/tree-vect-loop-manip.c
> ===================================================================
> --- gcc/tree-vect-loop-manip.c	(revision 272636)
> +++ gcc/tree-vect-loop-manip.c	(working copy)
> @@ -3114,8 +3114,17 @@ vect_loop_versioning (loop_vec_info loop
>  				 GSI_SAME_STMT);
>  	}
>  
> -      /* ???  if-conversion uses profile_probability::always () but
> -         prob below is profile_probability::likely ().  */
> +      /* if-conversion uses profile_probability::always () for both paths,
> +	 reset the paths probabilities appropriately.  */
> +      edge te, fe;
> +      extract_true_false_edges_from_block (condition_bb, &te, &fe);
> +      te->probability = prob;
> +      fe->probability = prob.invert ();
> +      /* We can scale loops counts immediately but have to postpone
> +         scaling the scalar loop because we re-use it during peeling.  */
> +      scale_loop_frequencies (loop_to_version, prob);
> +      LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = prob.invert ();
> +
>        nloop = scalar_loop;
>        if (dump_enabled_p ())
>  	dump_printf_loc (MSG_NOTE, vect_location,
>
Jan Hubicka July 4, 2019, 12:19 p.m. UTC | #2
> On Tue, 25 Jun 2019, Richard Biener wrote:
> 
> > 
> > PR90911 reports a slowdown of 456.hmmer with the recent introduction
> > of vectorizer versioning of outer loops, more specifically the case
> > of re-using if-conversion created versions.
> > 
> > The patch below fixes things up to adjust the edge probability
> > and scale the loop bodies in two steps, delaying scalar_loop
> > scaling until all peeling is done.  This restores profile-mismatches
> > to the same state as it was on the GCC 9 branch and seems to
> > fix the observed slowdown of 456.hmmer.
> > 
> > Boostrap & regtest running on x86_64-unknown-linux-gnu.
> > 
> > Honza, does this look OK?
> 
> Ping.
Sorry for taking time - i meant to look into what is happening here.
hmmer works faster with your patch w/o PGO however we now have 20%
improvement on PGO runs this week (as seen in LNT)
https://lnt.opensuse.org/db_default/v4/SPEC/graph?plot.0=7.180.0
So the long standing issue of hmmer with PGO seems to be related to this
update.
> 
> > Thanks,
> > Richard.
> > 
> > 2019-06-25  Richard Biener  <rguenther@suse.de>
> > 
> > 	* tree-vectorizer.h (_loop_vec_info::scalar_loop_scaling): New field.
> > 	(LOOP_VINFO_SCALAR_LOOP_SCALING): new.
> > 	* tree-vect-loop.c (_loop_vec_info::_loop_vec_info): Initialize
> > 	scalar_loop_scaling.
> > 	(vect_transform_loop): Scale scalar loop profile if needed.
> > 	* tree-vect-loop-manip.c (vect_loop_versioning): When re-using
> > 	the loop copy from if-conversion adjust edge probabilities
> > 	and scale the vectorized loop body profile, queue the scalar
> > 	profile for updating after peeling.
> > 
> > Index: gcc/tree-vectorizer.h
> > ===================================================================
> > --- gcc/tree-vectorizer.h	(revision 272636)
> > +++ gcc/tree-vectorizer.h	(working copy)
> > @@ -548,6 +548,9 @@ typedef struct _loop_vec_info : public v
> >    /* Mark loops having masked stores.  */
> >    bool has_mask_store;
> >  
> > +  /* Queued scaling factor for the scalar loop.  */
> > +  profile_probability scalar_loop_scaling;
> > +
> >    /* If if-conversion versioned this loop before conversion, this is the
> >       loop version without if-conversion.  */
> >    struct loop *scalar_loop;
> > @@ -603,6 +606,7 @@ typedef struct _loop_vec_info : public v
> >  #define LOOP_VINFO_PEELING_FOR_NITER(L)    (L)->peeling_for_niter
> >  #define LOOP_VINFO_NO_DATA_DEPENDENCIES(L) (L)->no_data_dependencies
> >  #define LOOP_VINFO_SCALAR_LOOP(L)	   (L)->scalar_loop
> > +#define LOOP_VINFO_SCALAR_LOOP_SCALING(L)  (L)->scalar_loop_scaling
> >  #define LOOP_VINFO_HAS_MASK_STORE(L)       (L)->has_mask_store
> >  #define LOOP_VINFO_SCALAR_ITERATION_COST(L) (L)->scalar_cost_vec
> >  #define LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST(L) (L)->single_scalar_iteration_cost
> > Index: gcc/tree-vect-loop.c
> > ===================================================================
> > --- gcc/tree-vect-loop.c	(revision 272636)
> > +++ gcc/tree-vect-loop.c	(working copy)
> > @@ -835,6 +835,7 @@ _loop_vec_info::_loop_vec_info (struct l
> >      operands_swapped (false),
> >      no_data_dependencies (false),
> >      has_mask_store (false),
> > +    scalar_loop_scaling (profile_probability::uninitialized ()),
> >      scalar_loop (NULL),
> >      orig_loop_info (NULL)
> >  {
> > @@ -8562,6 +8563,10 @@ vect_transform_loop (loop_vec_info loop_
> >    epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
> >  			      &step_vector, &niters_vector_mult_vf, th,
> >  			      check_profitability, niters_no_overflow);
> > +  if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
> > +      && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
> > +    scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
> > +			    LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
> >  
> >    if (niters_vector == NULL_TREE)
> >      {
> > Index: gcc/tree-vect-loop-manip.c
> > ===================================================================
> > --- gcc/tree-vect-loop-manip.c	(revision 272636)
> > +++ gcc/tree-vect-loop-manip.c	(working copy)
> > @@ -3114,8 +3114,17 @@ vect_loop_versioning (loop_vec_info loop
> >  				 GSI_SAME_STMT);
> >  	}
> >  
> > -      /* ???  if-conversion uses profile_probability::always () but
> > -         prob below is profile_probability::likely ().  */
> > +      /* if-conversion uses profile_probability::always () for both paths,
> > +	 reset the paths probabilities appropriately.  */
> > +      edge te, fe;
> > +      extract_true_false_edges_from_block (condition_bb, &te, &fe);
> > +      te->probability = prob;
> > +      fe->probability = prob.invert ();
> > +      /* We can scale loops counts immediately but have to postpone
> > +         scaling the scalar loop because we re-use it during peeling.  */
> > +      scale_loop_frequencies (loop_to_version, prob);
> > +      LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = prob.invert ();

Perhaps using te->probability/fe->probability would make it more obvious
what is going on (and save one inversion).

Patch is OK.
Since the PGO runs only once a week, it would be great if you could
check whether hmmer works well with PGO now.

Honza
diff mbox series

Patch

Index: gcc/tree-vectorizer.h
===================================================================
--- gcc/tree-vectorizer.h	(revision 272636)
+++ gcc/tree-vectorizer.h	(working copy)
@@ -548,6 +548,9 @@  typedef struct _loop_vec_info : public v
   /* Mark loops having masked stores.  */
   bool has_mask_store;
 
+  /* Queued scaling factor for the scalar loop.  */
+  profile_probability scalar_loop_scaling;
+
   /* If if-conversion versioned this loop before conversion, this is the
      loop version without if-conversion.  */
   struct loop *scalar_loop;
@@ -603,6 +606,7 @@  typedef struct _loop_vec_info : public v
 #define LOOP_VINFO_PEELING_FOR_NITER(L)    (L)->peeling_for_niter
 #define LOOP_VINFO_NO_DATA_DEPENDENCIES(L) (L)->no_data_dependencies
 #define LOOP_VINFO_SCALAR_LOOP(L)	   (L)->scalar_loop
+#define LOOP_VINFO_SCALAR_LOOP_SCALING(L)  (L)->scalar_loop_scaling
 #define LOOP_VINFO_HAS_MASK_STORE(L)       (L)->has_mask_store
 #define LOOP_VINFO_SCALAR_ITERATION_COST(L) (L)->scalar_cost_vec
 #define LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST(L) (L)->single_scalar_iteration_cost
Index: gcc/tree-vect-loop.c
===================================================================
--- gcc/tree-vect-loop.c	(revision 272636)
+++ gcc/tree-vect-loop.c	(working copy)
@@ -835,6 +835,7 @@  _loop_vec_info::_loop_vec_info (struct l
     operands_swapped (false),
     no_data_dependencies (false),
     has_mask_store (false),
+    scalar_loop_scaling (profile_probability::uninitialized ()),
     scalar_loop (NULL),
     orig_loop_info (NULL)
 {
@@ -8562,6 +8563,10 @@  vect_transform_loop (loop_vec_info loop_
   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
 			      &step_vector, &niters_vector_mult_vf, th,
 			      check_profitability, niters_no_overflow);
+  if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
+      && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
+    scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
+			    LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
 
   if (niters_vector == NULL_TREE)
     {
Index: gcc/tree-vect-loop-manip.c
===================================================================
--- gcc/tree-vect-loop-manip.c	(revision 272636)
+++ gcc/tree-vect-loop-manip.c	(working copy)
@@ -3114,8 +3114,17 @@  vect_loop_versioning (loop_vec_info loop
 				 GSI_SAME_STMT);
 	}
 
-      /* ???  if-conversion uses profile_probability::always () but
-         prob below is profile_probability::likely ().  */
+      /* if-conversion uses profile_probability::always () for both paths,
+	 reset the paths probabilities appropriately.  */
+      edge te, fe;
+      extract_true_false_edges_from_block (condition_bb, &te, &fe);
+      te->probability = prob;
+      fe->probability = prob.invert ();
+      /* We can scale loops counts immediately but have to postpone
+         scaling the scalar loop because we re-use it during peeling.  */
+      scale_loop_frequencies (loop_to_version, prob);
+      LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = prob.invert ();
+
       nloop = scalar_loop;
       if (dump_enabled_p ())
 	dump_printf_loc (MSG_NOTE, vect_location,