
Fix PR65701(?), fix typo in vect_enhance_data_refs_alignment

Message ID alpine.LSU.2.11.1505221105210.30088@zhemvz.fhfr.qr
State New

Commit Message

Richard Biener May 22, 2015, 9:06 a.m. UTC
On Fri, 10 Apr 2015, Richard Biener wrote:

> On Fri, 10 Apr 2015, Richard Biener wrote:
> 
> > 
> > The following patch fixes a typo (I think) which is present since the
> > original introduction of the code in vect_enhance_data_refs_alignment.
> > 
> >   if (do_peeling
> >       && all_misalignments_unknown
> >       && vect_supportable_dr_alignment (dr0, false))
> >     {
> >       /* Check if the target requires to prefer stores over loads, i.e., if
> >          misaligned stores are more expensive than misaligned loads (taking
> >          drs with same alignment into account).  */
> >       if (first_store && DR_IS_READ (dr0))
> >         {
> > ...
> >         }
> > 
> >       /* In case there are only loads with different unknown misalignments, use
> >          peeling only if it may help to align other accesses in the loop.  */
> >       if (!first_store
> >           && !STMT_VINFO_SAME_ALIGN_REFS (
> >                   vinfo_for_stmt (DR_STMT (dr0))).length ()
> >           && vect_supportable_dr_alignment (dr0, false)
> >               != dr_unaligned_supported)
> >         do_peeling = false;
> >     }
> > 
> > the last vect_supportable_dr_alignment check doesn't make much sense.
> > It's not mentioned in the comment and I see no reason for treating
> > dr_unaligned_supported any differently here compared to
> > dr_explicit_realign_[optimized].
> 
> Ok, as always a second after you hit <send> you think of the reason
> for this test.  I suppose the idea was that aligning a single load
> might improve load bandwidth (the same reason we eventually prefer
> aligning stores).  dr_explicit_realign_[optimized] perform
> aligned loads, thus it makes sense to special-case them.
> I suppose the cost model in that case should be more precise
> (and note that for bdver2 aligned and unaligned loads have the
> same cost).
> 
> So sth like
> 
>       /* In case there are only loads with different unknown misalignments, use
>          peeling only if it may help to align other accesses in the loop or
>          if it may help improving load bandwith when we'd end up using
>          unaligned loads.  */
>       tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
>       if (!first_store
>           && !STMT_VINFO_SAME_ALIGN_REFS (
>                   vinfo_for_stmt (DR_STMT (dr0))).length ()
>           && (vect_supportable_dr_alignment (dr0, false)
>               != dr_unaligned_supported
>               || (builtin_vectorization_cost (vector_load, dr0_vt, 0)
>                   == builtin_vectorization_cost (unaligned_load, dr0_vt, -1))))
>         do_peeling = false;
> 
> Looks like all Intel CPUs still have larger unaligned load cost
> compared to aligned load cost (the same is true for bdver2 in principle -
> the cost isn't about the instruction encoding but the actual
> data alignment).  So it might be artificially fixing 187.facerec with
> bdver2 (we don't have a good idea whether we're going to be
> store or load bandwidth limited in the vectorizer).
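
To illustrate the comparison being added: the new condition asks the target,
via the builtin_vectorization_cost hook, whether an aligned vector load and an
unaligned one cost the same.  The fragment below is only a sketch of such a
hook; the hook name and the cost numbers are made up and not taken from any
real backend - the only thing that matters for the check is whether
vector_load and unaligned_load report equal costs.

    /* Hypothetical target hook for TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
       (illustration only, not part of the patch).  */
    static int
    example_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
                                        tree vectype ATTRIBUTE_UNUSED,
                                        int misalign ATTRIBUTE_UNUSED)
    {
      switch (type_of_cost)
        {
        case vector_load:
          /* Aligned vector load.  */
          return 1;
        case unaligned_load:
          /* Intel-style model: unaligned loads cost more, so the new equality
             test is false and peeling a lone load with unknown misalignment
             stays enabled.  A bdver2-style model would return 1 here, the
             costs compare equal and peeling is disabled.  */
          return 2;
        default:
          return 1;
        }
    }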

Still, the following is what I have bootstrapped and tested
on x86_64-unknown-linux-gnu and applied to trunk.

Richard.

2015-05-22  Richard Biener  <rguenther@suse.de>

	PR tree-optimization/65701
	* tree-vect-data-refs.c (vect_enhance_data_refs_alignment):
	Move peeling cost models into one place.  Peel for alignment
	for single loads only if an aligned load is cheaper than
	an unaligned load.
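
As a concrete illustration of the consolidated "cost model #2" check in the
patch below: when the exact peel amount npeel is not known at compile time,
the check assumes up to VF - 1 peeled iterations, so with a vectorization
factor of 4 a loop known to iterate fewer than 4 + 3 = 7 times has peeling
disabled, since the remaining loop could not be vectorized.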

Patch

Index: gcc/tree-vect-data-refs.c
===================================================================
--- gcc/tree-vect-data-refs.c	(revision 223349)
+++ gcc/tree-vect-data-refs.c	(working copy)
@@ -1541,16 +1541,6 @@  vect_enhance_data_refs_alignment (loop_v
       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
     do_peeling = false;
 
-  /* If we don't know how many times the peeling loop will run
-     assume it will run VF-1 times and disable peeling if the remaining
-     iters are less than the vectorization factor.  */
-  if (do_peeling
-      && all_misalignments_unknown
-      && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-      && (LOOP_VINFO_INT_NITERS (loop_vinfo)
-	  < 2 * (unsigned) LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1))
-    do_peeling = false;
-
   if (do_peeling
       && all_misalignments_unknown
       && vect_supportable_dr_alignment (dr0, false))
@@ -1619,12 +1609,17 @@  vect_enhance_data_refs_alignment (loop_v
         }
 
       /* In case there are only loads with different unknown misalignments, use
-         peeling only if it may help to align other accesses in the loop.  */
+         peeling only if it may help to align other accesses in the loop or
+	 if it may help improving load bandwith when we'd end up using
+	 unaligned loads.  */
+      tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
       if (!first_store
 	  && !STMT_VINFO_SAME_ALIGN_REFS (
 		  vinfo_for_stmt (DR_STMT (dr0))).length ()
-          && vect_supportable_dr_alignment (dr0, false)
-              != dr_unaligned_supported)
+	  && (vect_supportable_dr_alignment (dr0, false)
+	      != dr_unaligned_supported
+	      || (builtin_vectorization_cost (vector_load, dr0_vt, 0)
+		  == builtin_vectorization_cost (unaligned_load, dr0_vt, -1))))
         do_peeling = false;
     }
 
@@ -1641,14 +1636,6 @@  vect_enhance_data_refs_alignment (loop_v
 						   &body_cost_vec);
       if (!dr0 || !npeel)
         do_peeling = false;
-
-      /* If peeling by npeel will result in a remaining loop not iterating
-         enough to be vectorized then do not peel.  */
-      if (do_peeling
-	  && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-	  && (LOOP_VINFO_INT_NITERS (loop_vinfo)
-	      < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + npeel))
-	do_peeling = false;
     }
 
   if (do_peeling)
@@ -1733,6 +1720,7 @@  vect_enhance_data_refs_alignment (loop_v
 	    }
         }
 
+      /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
       if (do_peeling)
         {
           unsigned max_allowed_peel
@@ -1757,6 +1745,18 @@  vect_enhance_data_refs_alignment (loop_v
             }
         }
 
+      /* Cost model #2 - if peeling may result in a remaining loop not
+	 iterating enough to be vectorized then do not peel.  */
+      if (do_peeling
+	  && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
+	{
+	  unsigned max_peel
+	    = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
+	  if (LOOP_VINFO_INT_NITERS (loop_vinfo)
+	      < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
+	    do_peeling = false;
+	}
+
       if (do_peeling)
         {
           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.