diff mbox

[3/3] Vect peeling cost model

Message ID 17f2f208-c2b4-05f9-1822-2be09546fe30@linux.vnet.ibm.com
State New
Headers show

Commit Message

Robin Dapp May 4, 2017, 9:07 a.m. UTC
gcc/ChangeLog:

2017-04-26  Robin Dapp  <rdapp@linux.vnet.ibm.com>

	* tree-vect-data-refs.c (vect_peeling_hash_get_lowest_cost):
	Change cost model.
	(vect_peeling_hash_choose_best_peeling): Return extended peel info.
	(vect_peeling_supportable): Return peeling status.
diff mbox

Patch

diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
index 7b68582..da49e35 100644
--- a/gcc/tree-vect-data-refs.c
+++ b/gcc/tree-vect-data-refs.c
@@ -904,9 +904,9 @@  vect_compute_data_ref_alignment (struct data_reference *dr)
 
 
 /* Function vect_update_misalignment_for_peel.
-   Sets DR's misalignment
+   Set DR's misalignment
    - to 0 if it has the same alignment as DR_PEEL,
-   - to the misalignment computed using NPEEL if DR's salignment is known,
+   - to the misalignment computed using NPEEL if DR's misalignment is known,
    - to -1 (unknown) otherwise.
 
    DR - the data reference whose misalignment is to be adjusted.
@@ -1293,7 +1293,7 @@  vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
 {
   vect_peel_info elem = *slot;
   int dummy;
-  unsigned int inside_cost = 0, outside_cost = 0, i;
+  unsigned int inside_cost = 0, outside_cost = 0;
   gimple *stmt = DR_STMT (elem->dr);
   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
@@ -1342,7 +1342,7 @@  vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
    choosing an option with the lowest cost (if cost model is enabled) or the
    option that aligns as many accesses as possible.  */
 
-static struct data_reference *
+static struct _vect_peel_extended_info
 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
 				       loop_vec_info loop_vinfo,
                                        unsigned int *npeel,
@@ -1369,7 +1369,7 @@  vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_hta
 
    *npeel = res.peel_info.npeel;
    *body_cost_vec = res.body_cost_vec;
-   return res.peel_info.dr;
+   return res;
 }
 
 /* Return true if the new peeling NPEEL is supported.  */
@@ -1518,7 +1518,11 @@  vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   enum dr_alignment_support supportable_dr_alignment;
-  struct data_reference *dr0 = NULL, *first_store = NULL;
+
+  struct data_reference *most_frequent_read = NULL;
+  unsigned int dr_read_count = 0;
+  struct data_reference *most_frequent_write = NULL;
+  unsigned int dr_write_count = 0;
   struct data_reference *dr;
   unsigned int i, j;
   bool do_peeling = false;
@@ -1527,11 +1531,13 @@  vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
   gimple *stmt;
   stmt_vec_info stmt_info;
   unsigned int npeel = 0;
-  bool all_misalignments_unknown = true;
+  bool one_misalignment_known = false;
+  bool one_misalignment_unknown = false;
+
   unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
   unsigned possible_npeel_number = 1;
   tree vectype;
-  unsigned int nelements, mis, same_align_drs_max = 0;
+  unsigned int nelements, mis;
   stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
   hash_table<peel_info_hasher> peeling_htab (1);
 
@@ -1652,57 +1658,67 @@  vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
                   npeel_tmp += nelements;
                 }
 
-              all_misalignments_unknown = false;
-              /* Data-ref that was chosen for the case that all the
-                 misalignments are unknown is not relevant anymore, since we
-                 have a data-ref with known alignment.  */
-              dr0 = NULL;
+	      one_misalignment_known = true;
             }
           else
             {
-              /* If we don't know any misalignment values, we prefer
-                 peeling for data-ref that has the maximum number of data-refs
-                 with the same alignment, unless the target prefers to align
-                 stores over load.  */
-              if (all_misalignments_unknown)
-                {
-		  unsigned same_align_drs
-		    = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
-                  if (!dr0
-		      || same_align_drs_max < same_align_drs)
-                    {
-                      same_align_drs_max = same_align_drs;
-                      dr0 = dr;
-                    }
-		  /* For data-refs with the same number of related
-		     accesses prefer the one where the misalign
-		     computation will be invariant in the outermost loop.  */
-		  else if (same_align_drs_max == same_align_drs)
+	      /* If we don't know any misalignment values, we prefer
+		 peeling for data-ref that has the maximum number of data-refs
+		 with the same alignment, unless the target prefers to align
+		 stores over load.  */
+	      unsigned same_align_dr_count
+		= STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
+
+	      /* For data-refs with the same number of related
+		 accesses prefer the one where the misalign
+		 computation will be invariant in the outermost loop.  */
+	      struct loop *ivloop_max_read = NULL, *ivloop_max_write = NULL,
+			  *ivloop_dr = NULL;
+	      if (most_frequent_read)
+		ivloop_max_read = outermost_invariant_loop_for_expr
+		  (loop, DR_BASE_ADDRESS (most_frequent_read));
+	      if (most_frequent_write)
+		ivloop_max_write = outermost_invariant_loop_for_expr
+		  (loop, DR_BASE_ADDRESS (most_frequent_write));
+	      ivloop_dr = outermost_invariant_loop_for_expr
+		(loop, DR_BASE_ADDRESS (dr));
+
+	      if (DR_IS_READ (dr))
+		{
+		  if (!most_frequent_read
+		      || (same_align_dr_count > dr_read_count))
 		    {
-		      struct loop *ivloop0, *ivloop;
-		      ivloop0 = outermost_invariant_loop_for_expr
-			  (loop, DR_BASE_ADDRESS (dr0));
-		      ivloop = outermost_invariant_loop_for_expr
-			  (loop, DR_BASE_ADDRESS (dr));
-		      if ((ivloop && !ivloop0)
-			  || (ivloop && ivloop0
-			      && flow_loop_nested_p (ivloop, ivloop0)))
-			dr0 = dr;
+		      most_frequent_read = dr;
+		      dr_read_count = same_align_dr_count;
 		    }
+		  else if (same_align_dr_count == dr_read_count
+			   && ((ivloop_dr && !ivloop_max_read)
+			       || (ivloop_dr && ivloop_max_read
+				   && flow_loop_nested_p
+				   (ivloop_dr, ivloop_max_read))))
+		    {
+		      most_frequent_read = dr;
+		    }
+		}
+	      else if (DR_IS_WRITE (dr))
+		{
+		  if (!most_frequent_write
+		      || (same_align_dr_count > dr_write_count))
+		    {
+		      most_frequent_write = dr;
+		      dr_write_count = same_align_dr_count;
+		    }
+		  else if (same_align_dr_count == dr_write_count
+			   && ((ivloop_dr && !ivloop_max_write)
+			       || (ivloop_dr && ivloop_max_write
+				   && flow_loop_nested_p
+				   (ivloop_dr, ivloop_max_write))))
+		    {
+		      most_frequent_write = dr;
+		    }
+		}
 
-                  if (!first_store && DR_IS_WRITE (dr))
-                    first_store = dr;
-                }
-
-              /* If there are both known and unknown misaligned accesses in the
-                 loop, we choose peeling amount according to the known
-                 accesses.  */
-              if (!supportable_dr_alignment)
-                {
-                  dr0 = dr;
-                  if (!first_store && DR_IS_WRITE (dr))
-                    first_store = dr;
-                }
+	      one_misalignment_unknown = true;
             }
         }
       else
@@ -1723,111 +1739,207 @@  vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
       || loop->inner)
     do_peeling = false;
 
-  if (do_peeling
-      && all_misalignments_unknown
-      && vect_supportable_dr_alignment (dr0, false))
+  struct _vect_peel_extended_info best_peel;
+  struct _vect_peel_extended_info peel_for_known_alignment;
+  peel_for_known_alignment.peel_info.dr = NULL;
+  struct _vect_peel_extended_info peel_for_unknown_alignment;
+  peel_for_unknown_alignment.peel_info.dr = NULL;
+
+  if (do_peeling && one_misalignment_known)
     {
-      /* Check if the target requires to prefer stores over loads, i.e., if
-         misaligned stores are more expensive than misaligned loads (taking
-         drs with same alignment into account).  */
-      if (first_store && DR_IS_READ (dr0))
-        {
-          unsigned int load_inside_cost = 0, load_outside_cost = 0;
-          unsigned int store_inside_cost = 0, store_outside_cost = 0;
-          unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
-          unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
+      /* Peeling is possible, but there is no data access that is not supported
+	 unless aligned. So we try to choose the best possible peeling.  */
+
+      /* Choose the best peeling from the hash table.  */
+      peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
+	(&peeling_htab, loop_vinfo, &npeel, &body_cost_vec);
+    }
+
+  if (do_peeling && one_misalignment_unknown)
+    {
+      /* Calculate the costs for aligning MOST_FREQUENT_READ, potentially
+         leaving everything else misaligned.  */
+      unsigned int align_mf_read_inside_cost = 0;
+      unsigned int align_mf_read_outside_cost = 0;
+
+      if (most_frequent_read)
+	{
 	  stmt_vector_for_cost dummy;
 	  dummy.create (2);
+	  vect_get_peeling_costs_all_drs (most_frequent_read,
+					  &align_mf_read_inside_cost,
+					  &align_mf_read_outside_cost,
+					  &dummy, vf / 2, vf);
+	  dummy.release();
+	}
+      else
+	{
+	  align_mf_read_inside_cost = UINT_MAX;
+	  align_mf_read_outside_cost = UINT_MAX;
+	}
 
-          vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
-				     &dummy);
-          vect_get_data_access_cost (first_store, &store_inside_cost,
-				     &store_outside_cost, &dummy);
+      /* Calculate the costs for aligning MOST_FREQUENT_WRITE, potentially
+         leaving everything else misaligned.  */
+      unsigned int align_mf_write_inside_cost = 0;
+      unsigned int align_mf_write_outside_cost = 0;
 
+      if (most_frequent_write)
+	{
+	  stmt_vector_for_cost dummy;
+	  dummy.create (2);
+	  vect_get_peeling_costs_all_drs (most_frequent_write,
+					  &align_mf_write_inside_cost,
+					  &align_mf_write_outside_cost,
+					  &dummy, vf / 2, vf);
 	  dummy.release ();
+	}
+      else
+	{
+	  align_mf_write_inside_cost = UINT_MAX;
+	  align_mf_write_outside_cost = UINT_MAX;
+	}
 
-          /* Calculate the penalty for leaving FIRST_STORE unaligned (by
-             aligning the load DR0).  */
-          load_inside_penalty = store_inside_cost;
-          load_outside_penalty = store_outside_cost;
-          for (i = 0;
-	       STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
-			  DR_STMT (first_store))).iterate (i, &dr);
-               i++)
-            if (DR_IS_READ (dr))
-              {
-                load_inside_penalty += load_inside_cost;
-                load_outside_penalty += load_outside_cost;
-              }
-            else
-              {
-                load_inside_penalty += store_inside_cost;
-                load_outside_penalty += store_outside_cost;
-              }
-
-          /* Calculate the penalty for leaving DR0 unaligned (by
-             aligning the FIRST_STORE).  */
-          store_inside_penalty = load_inside_cost;
-          store_outside_penalty = load_outside_cost;
-          for (i = 0;
-	       STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
-		      DR_STMT (dr0))).iterate (i, &dr);
-               i++)
-            if (DR_IS_READ (dr))
-              {
-                store_inside_penalty += load_inside_cost;
-                store_outside_penalty += load_outside_cost;
-              }
-            else
-              {
-                store_inside_penalty += store_inside_cost;
-                store_outside_penalty += store_outside_cost;
-              }
-
-          if (load_inside_penalty > store_inside_penalty
-              || (load_inside_penalty == store_inside_penalty
-                  && load_outside_penalty > store_outside_penalty))
-            dr0 = first_store;
-        }
+      /* Choose best peeling according to given load and store peeling
+	 costs.  */
+      if (align_mf_read_inside_cost > align_mf_write_inside_cost
+	  || (align_mf_read_inside_cost == align_mf_write_inside_cost
+	      && align_mf_read_outside_cost > align_mf_write_outside_cost))
+	{
+	  peel_for_unknown_alignment.peel_info.dr = most_frequent_write;
+	  peel_for_unknown_alignment.peel_info.count =
+	    1 + STMT_VINFO_SAME_ALIGN_REFS
+	    (vinfo_for_stmt (DR_STMT (most_frequent_write))).length ();
+	  peel_for_unknown_alignment.inside_cost = align_mf_write_inside_cost;
+	  peel_for_unknown_alignment.outside_cost =
+	    align_mf_write_outside_cost;
+	}
+      else
+	{
+	  if (most_frequent_read)
+	    {
+	      peel_for_unknown_alignment.peel_info.dr = most_frequent_read;
+	      peel_for_unknown_alignment.peel_info.count =
+		1 + STMT_VINFO_SAME_ALIGN_REFS
+		(vinfo_for_stmt (DR_STMT (most_frequent_read))).length ();
+	    }
+	  else
+	    {
+	      peel_for_unknown_alignment.peel_info.dr = most_frequent_write;
+	      peel_for_unknown_alignment.peel_info.count =
+		1 + STMT_VINFO_SAME_ALIGN_REFS
+		(vinfo_for_stmt (DR_STMT (most_frequent_write))).length ();
+	    }
+	  peel_for_unknown_alignment.inside_cost = align_mf_read_inside_cost;
+	  peel_for_unknown_alignment.outside_cost =
+	    align_mf_read_outside_cost;
+	}
+
+      /* Add prologue and epilogue costs.  */
+      stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
+      prologue_cost_vec.create (2);
+      epilogue_cost_vec.create (2);
+
+      int dummy;
+      peel_for_unknown_alignment.outside_cost
+	+= vect_get_known_peeling_cost (loop_vinfo, vf / 2, &dummy,
+	 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
+	 &prologue_cost_vec, &epilogue_cost_vec);
 
-      /* In case there are only loads with different unknown misalignments, use
-         peeling only if it may help to align other accesses in the loop or
-	 if it may help improving load bandwith when we'd end up using
-	 unaligned loads.  */
-      tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
-      if (!first_store
-	  && !STMT_VINFO_SAME_ALIGN_REFS (
-		  vinfo_for_stmt (DR_STMT (dr0))).length ()
-	  && (vect_supportable_dr_alignment (dr0, false)
-	      != dr_unaligned_supported
-	      || (builtin_vectorization_cost (vector_load, dr0_vt, 0)
-		  == builtin_vectorization_cost (unaligned_load, dr0_vt, -1))))
-        do_peeling = false;
+      prologue_cost_vec.release ();
+      epilogue_cost_vec.release ();
+
+      /* The code below expects npeel == 0 when we plan to peel vf/2
+	 iterations, so do not set npeel = vf/2 here.  */
+      peel_for_unknown_alignment.peel_info.npeel = 0;
     }
 
-  if (do_peeling && !dr0)
+  /* At this point, we have to choose between peeling for the datarefs with
+     known alignment and the ones with unknown alignment.  Prefer the one
+     that aligns more datarefs in total.  */
+  struct data_reference *dr0 = NULL;
+  if (do_peeling)
     {
-      /* Peeling is possible, but there is no data access that is not supported
-         unless aligned. So we try to choose the best possible peeling.  */
+      bool peel_for_unknown_alignment_valid =
+	peel_for_unknown_alignment.peel_info.dr != NULL;
+      bool peel_for_known_alignment_valid =
+	peel_for_known_alignment.peel_info.dr != NULL;
 
-      /* We should get here only if there are drs with known misalignment.  */
-      gcc_assert (!all_misalignments_unknown);
+      gcc_assert (peel_for_known_alignment_valid
+		  || peel_for_unknown_alignment_valid);
 
-      /* Choose the best peeling from the hash table.  */
-      dr0 = vect_peeling_hash_choose_best_peeling (&peeling_htab,
-						   loop_vinfo, &npeel,
-						   &body_cost_vec);
-      if (!dr0 || !npeel)
-        do_peeling = false;
+      if (peel_for_known_alignment_valid && !peel_for_unknown_alignment_valid)
+	best_peel = peel_for_known_alignment;
+
+      else if
+	(!peel_for_known_alignment_valid && peel_for_unknown_alignment_valid)
+	best_peel = peel_for_unknown_alignment;
+
+      else
+	{
+	  /* Choose the best peeling for known and unknown alignment
+	     according to the number of aligned datarefs.  */
+	  if (peel_for_unknown_alignment.peel_info.count
+	      > peel_for_known_alignment.peel_info.count)
+	    best_peel = peel_for_unknown_alignment;
+	  else
+	    best_peel = peel_for_known_alignment;
+	}
     }
 
+  /* Calculate the penalty for no peeling, i.e. leaving everything
+     unaligned.
+     TODO: use something like an adapted vect_get_peeling_costs_all_drs.  */
+  unsigned nopeel_inside_cost = 0;
+  unsigned nopeel_outside_cost = 0;
+
+  stmt_vector_for_cost dummy;
+  dummy.create (2);
+  FOR_EACH_VEC_ELT (datarefs, i, dr)
+    {
+      vect_get_data_access_cost (dr, &nopeel_inside_cost,
+				 &nopeel_outside_cost, &dummy);
+    }
+  dummy.release ();
+
+  /* Add epilogue costs.  As we do no peeling for alignment here, no prologue
+     costs will be recorded.  */
+  stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
+  prologue_cost_vec.create (2);
+  epilogue_cost_vec.create (2);
+
+  int dummy2;
+  nopeel_outside_cost += vect_get_known_peeling_cost
+    (loop_vinfo, 0, &dummy2,
+     &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
+     &prologue_cost_vec, &epilogue_cost_vec);
+
+  prologue_cost_vec.release ();
+  epilogue_cost_vec.release ();
+
+
+  /* Check if doing no peeling is not more expensive than the best peeling we
+     have so far.  */
+  if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo))
+      && ((nopeel_inside_cost < best_peel.inside_cost)
+	  || (nopeel_inside_cost == best_peel.inside_cost
+	  && nopeel_outside_cost <= best_peel.outside_cost)))
+    {
+      do_peeling = false;
+      npeel = 0;
+    }
+
+
   if (do_peeling)
     {
+      dr0 = best_peel.peel_info.dr;
+      npeel = best_peel.peel_info.npeel;
+
       stmt = DR_STMT (dr0);
       stmt_info = vinfo_for_stmt (stmt);
       vectype = STMT_VINFO_VECTYPE (stmt_info);
       nelements = TYPE_VECTOR_SUBPARTS (vectype);
 
+      /* Define a peeling if not already set and log it.  */
       if (known_alignment_for_access_p (dr0))
         {
 	  bool negative = DR_HAS_NEGATIVE_STEP (dr0);