Message ID | alpine.LSU.2.20.1906251350380.10704@zhemvz.fhfr.qr |
---|---|
State | New |
Headers | show |
Series | Try fix PR90911 | expand |
On Tue, 25 Jun 2019, Richard Biener wrote: > > PR90911 reports a slowdown of 456.hmmer with the recent introduction > of vectorizer versioning of outer loops, more specifically the case > of re-using if-conversion created versions. > > The patch below fixes things up to adjust the edge probability > and scale the loop bodies in two steps, delaying scalar_loop > scaling until all peeling is done. This restores profile-mismatches > to the same state as it was on the GCC 9 branch and seems to > fix the observed slowdown of 456.hmmer. > > Boostrap & regtest running on x86_64-unknown-linux-gnu. > > Honza, does this look OK? Ping. > Thanks, > Richard. > > 2019-06-25 Richard Biener <rguenther@suse.de> > > * tree-vectorizer.h (_loop_vec_info::scalar_loop_scaling): New field. > (LOOP_VINFO_SCALAR_LOOP_SCALING): new. > * tree-vect-loop.c (_loop_vec_info::_loop_vec_info): Initialize > scalar_loop_scaling. > (vect_transform_loop): Scale scalar loop profile if needed. > * tree-vect-loop-manip.c (vect_loop_versioning): When re-using > the loop copy from if-conversion adjust edge probabilities > and scale the vectorized loop body profile, queue the scalar > profile for updating after peeling. > > Index: gcc/tree-vectorizer.h > =================================================================== > --- gcc/tree-vectorizer.h (revision 272636) > +++ gcc/tree-vectorizer.h (working copy) > @@ -548,6 +548,9 @@ typedef struct _loop_vec_info : public v > /* Mark loops having masked stores. */ > bool has_mask_store; > > + /* Queued scaling factor for the scalar loop. */ > + profile_probability scalar_loop_scaling; > + > /* If if-conversion versioned this loop before conversion, this is the > loop version without if-conversion. */ > struct loop *scalar_loop; > @@ -603,6 +606,7 @@ typedef struct _loop_vec_info : public v > #define LOOP_VINFO_PEELING_FOR_NITER(L) (L)->peeling_for_niter > #define LOOP_VINFO_NO_DATA_DEPENDENCIES(L) (L)->no_data_dependencies > #define LOOP_VINFO_SCALAR_LOOP(L) (L)->scalar_loop > +#define LOOP_VINFO_SCALAR_LOOP_SCALING(L) (L)->scalar_loop_scaling > #define LOOP_VINFO_HAS_MASK_STORE(L) (L)->has_mask_store > #define LOOP_VINFO_SCALAR_ITERATION_COST(L) (L)->scalar_cost_vec > #define LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST(L) (L)->single_scalar_iteration_cost > Index: gcc/tree-vect-loop.c > =================================================================== > --- gcc/tree-vect-loop.c (revision 272636) > +++ gcc/tree-vect-loop.c (working copy) > @@ -835,6 +835,7 @@ _loop_vec_info::_loop_vec_info (struct l > operands_swapped (false), > no_data_dependencies (false), > has_mask_store (false), > + scalar_loop_scaling (profile_probability::uninitialized ()), > scalar_loop (NULL), > orig_loop_info (NULL) > { > @@ -8562,6 +8563,10 @@ vect_transform_loop (loop_vec_info loop_ > epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector, > &step_vector, &niters_vector_mult_vf, th, > check_profitability, niters_no_overflow); > + if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo) > + && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ()) > + scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo), > + LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo)); > > if (niters_vector == NULL_TREE) > { > Index: gcc/tree-vect-loop-manip.c > =================================================================== > --- gcc/tree-vect-loop-manip.c (revision 272636) > +++ gcc/tree-vect-loop-manip.c (working copy) > @@ -3114,8 +3114,17 @@ vect_loop_versioning (loop_vec_info loop > GSI_SAME_STMT); > } > > - /* ??? if-conversion uses profile_probability::always () but > - prob below is profile_probability::likely (). */ > + /* if-conversion uses profile_probability::always () for both paths, > + reset the paths probabilities appropriately. */ > + edge te, fe; > + extract_true_false_edges_from_block (condition_bb, &te, &fe); > + te->probability = prob; > + fe->probability = prob.invert (); > + /* We can scale loops counts immediately but have to postpone > + scaling the scalar loop because we re-use it during peeling. */ > + scale_loop_frequencies (loop_to_version, prob); > + LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = prob.invert (); > + > nloop = scalar_loop; > if (dump_enabled_p ()) > dump_printf_loc (MSG_NOTE, vect_location, >
> On Tue, 25 Jun 2019, Richard Biener wrote: > > > > > PR90911 reports a slowdown of 456.hmmer with the recent introduction > > of vectorizer versioning of outer loops, more specifically the case > > of re-using if-conversion created versions. > > > > The patch below fixes things up to adjust the edge probability > > and scale the loop bodies in two steps, delaying scalar_loop > > scaling until all peeling is done. This restores profile-mismatches > > to the same state as it was on the GCC 9 branch and seems to > > fix the observed slowdown of 456.hmmer. > > > > Boostrap & regtest running on x86_64-unknown-linux-gnu. > > > > Honza, does this look OK? > > Ping. Sorry for taking time - i meant to look into what is happening here. hmmer works faster with your patch w/o PGO however we now have 20% improvement on PGO runs this week (as seen in LNT) https://lnt.opensuse.org/db_default/v4/SPEC/graph?plot.0=7.180.0 So the long standing issue of hmmer with PGO seems to be related to this update. > > > Thanks, > > Richard. > > > > 2019-06-25 Richard Biener <rguenther@suse.de> > > > > * tree-vectorizer.h (_loop_vec_info::scalar_loop_scaling): New field. > > (LOOP_VINFO_SCALAR_LOOP_SCALING): new. > > * tree-vect-loop.c (_loop_vec_info::_loop_vec_info): Initialize > > scalar_loop_scaling. > > (vect_transform_loop): Scale scalar loop profile if needed. > > * tree-vect-loop-manip.c (vect_loop_versioning): When re-using > > the loop copy from if-conversion adjust edge probabilities > > and scale the vectorized loop body profile, queue the scalar > > profile for updating after peeling. > > > > Index: gcc/tree-vectorizer.h > > =================================================================== > > --- gcc/tree-vectorizer.h (revision 272636) > > +++ gcc/tree-vectorizer.h (working copy) > > @@ -548,6 +548,9 @@ typedef struct _loop_vec_info : public v > > /* Mark loops having masked stores. */ > > bool has_mask_store; > > > > + /* Queued scaling factor for the scalar loop. */ > > + profile_probability scalar_loop_scaling; > > + > > /* If if-conversion versioned this loop before conversion, this is the > > loop version without if-conversion. */ > > struct loop *scalar_loop; > > @@ -603,6 +606,7 @@ typedef struct _loop_vec_info : public v > > #define LOOP_VINFO_PEELING_FOR_NITER(L) (L)->peeling_for_niter > > #define LOOP_VINFO_NO_DATA_DEPENDENCIES(L) (L)->no_data_dependencies > > #define LOOP_VINFO_SCALAR_LOOP(L) (L)->scalar_loop > > +#define LOOP_VINFO_SCALAR_LOOP_SCALING(L) (L)->scalar_loop_scaling > > #define LOOP_VINFO_HAS_MASK_STORE(L) (L)->has_mask_store > > #define LOOP_VINFO_SCALAR_ITERATION_COST(L) (L)->scalar_cost_vec > > #define LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST(L) (L)->single_scalar_iteration_cost > > Index: gcc/tree-vect-loop.c > > =================================================================== > > --- gcc/tree-vect-loop.c (revision 272636) > > +++ gcc/tree-vect-loop.c (working copy) > > @@ -835,6 +835,7 @@ _loop_vec_info::_loop_vec_info (struct l > > operands_swapped (false), > > no_data_dependencies (false), > > has_mask_store (false), > > + scalar_loop_scaling (profile_probability::uninitialized ()), > > scalar_loop (NULL), > > orig_loop_info (NULL) > > { > > @@ -8562,6 +8563,10 @@ vect_transform_loop (loop_vec_info loop_ > > epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector, > > &step_vector, &niters_vector_mult_vf, th, > > check_profitability, niters_no_overflow); > > + if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo) > > + && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ()) > > + scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo), > > + LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo)); > > > > if (niters_vector == NULL_TREE) > > { > > Index: gcc/tree-vect-loop-manip.c > > =================================================================== > > --- gcc/tree-vect-loop-manip.c (revision 272636) > > +++ gcc/tree-vect-loop-manip.c (working copy) > > @@ -3114,8 +3114,17 @@ vect_loop_versioning (loop_vec_info loop > > GSI_SAME_STMT); > > } > > > > - /* ??? if-conversion uses profile_probability::always () but > > - prob below is profile_probability::likely (). */ > > + /* if-conversion uses profile_probability::always () for both paths, > > + reset the paths probabilities appropriately. */ > > + edge te, fe; > > + extract_true_false_edges_from_block (condition_bb, &te, &fe); > > + te->probability = prob; > > + fe->probability = prob.invert (); > > + /* We can scale loops counts immediately but have to postpone > > + scaling the scalar loop because we re-use it during peeling. */ > > + scale_loop_frequencies (loop_to_version, prob); > > + LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = prob.invert (); Perhaps using te->probability/fe->probability would make it more obvious what is going on (and save one inversion). Patch is OK. Since the PGO runs only once a week, it would be great if you could check whether hmmer works well with PGO now. Honza
Index: gcc/tree-vectorizer.h =================================================================== --- gcc/tree-vectorizer.h (revision 272636) +++ gcc/tree-vectorizer.h (working copy) @@ -548,6 +548,9 @@ typedef struct _loop_vec_info : public v /* Mark loops having masked stores. */ bool has_mask_store; + /* Queued scaling factor for the scalar loop. */ + profile_probability scalar_loop_scaling; + /* If if-conversion versioned this loop before conversion, this is the loop version without if-conversion. */ struct loop *scalar_loop; @@ -603,6 +606,7 @@ typedef struct _loop_vec_info : public v #define LOOP_VINFO_PEELING_FOR_NITER(L) (L)->peeling_for_niter #define LOOP_VINFO_NO_DATA_DEPENDENCIES(L) (L)->no_data_dependencies #define LOOP_VINFO_SCALAR_LOOP(L) (L)->scalar_loop +#define LOOP_VINFO_SCALAR_LOOP_SCALING(L) (L)->scalar_loop_scaling #define LOOP_VINFO_HAS_MASK_STORE(L) (L)->has_mask_store #define LOOP_VINFO_SCALAR_ITERATION_COST(L) (L)->scalar_cost_vec #define LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST(L) (L)->single_scalar_iteration_cost Index: gcc/tree-vect-loop.c =================================================================== --- gcc/tree-vect-loop.c (revision 272636) +++ gcc/tree-vect-loop.c (working copy) @@ -835,6 +835,7 @@ _loop_vec_info::_loop_vec_info (struct l operands_swapped (false), no_data_dependencies (false), has_mask_store (false), + scalar_loop_scaling (profile_probability::uninitialized ()), scalar_loop (NULL), orig_loop_info (NULL) { @@ -8562,6 +8563,10 @@ vect_transform_loop (loop_vec_info loop_ epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector, &step_vector, &niters_vector_mult_vf, th, check_profitability, niters_no_overflow); + if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo) + && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ()) + scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo), + LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo)); if (niters_vector == NULL_TREE) { Index: gcc/tree-vect-loop-manip.c =================================================================== --- gcc/tree-vect-loop-manip.c (revision 272636) +++ gcc/tree-vect-loop-manip.c (working copy) @@ -3114,8 +3114,17 @@ vect_loop_versioning (loop_vec_info loop GSI_SAME_STMT); } - /* ??? if-conversion uses profile_probability::always () but - prob below is profile_probability::likely (). */ + /* if-conversion uses profile_probability::always () for both paths, + reset the paths probabilities appropriately. */ + edge te, fe; + extract_true_false_edges_from_block (condition_bb, &te, &fe); + te->probability = prob; + fe->probability = prob.invert (); + /* We can scale loops counts immediately but have to postpone + scaling the scalar loop because we re-use it during peeling. */ + scale_loop_frequencies (loop_to_version, prob); + LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = prob.invert (); + nloop = scalar_loop; if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location,