
Move code out of vect_slp_analyze_bb_1

Message ID mpt5zkkn5a7.fsf@arm.com
State New
Series Move code out of vect_slp_analyze_bb_1

Commit Message

Richard Sandiford Oct. 19, 2019, 3:06 p.m. UTC
After the previous patch, it seems more natural to apply the
PARAM_SLP_MAX_INSNS_IN_BB threshold as soon as we know what
the region is, rather than delaying it to vect_slp_analyze_bb_1.
(But rather than carve out the biggest region possible and then
reject it, wouldn't it be better to stop when the region gets
too big, so we at least have a chance of vectorising something?)

It also seems more natural for vect_slp_bb_region to create the
bb_vec_info itself rather than (a) having to pass bits of data down
for the initialisation and (b) forcing vect_slp_analyze_bb_1 to free
on every failure return.

Tested on aarch64-linux-gnu and x86_64-linux-gnu.  OK to install?

Richard


2019-10-19  Richard Sandiford  <richard.sandiford@arm.com>

gcc/
	* tree-vect-slp.c (vect_slp_analyze_bb_1): Take a bb_vec_info
	and return a boolean success value.  Move the allocation and
	initialization of the bb_vec_info to...
	(vect_slp_bb_region): ...here.  Update call accordingly.
	(vect_slp_bb): Apply PARAM_SLP_MAX_INSNS_IN_BB here rather
	than in vect_slp_analyze_bb_1.

Comments

Richard Biener Oct. 19, 2019, 8 p.m. UTC | #1
On October 19, 2019 5:06:40 PM GMT+02:00, Richard Sandiford <richard.sandiford@arm.com> wrote:
>After the previous patch, it seems more natural to apply the
>PARAM_SLP_MAX_INSNS_IN_BB threshold as soon as we know what
>the region is, rather than delaying it to vect_slp_analyze_bb_1.
>(But rather than carve out the biggest region possible and then
>reject it, wouldn't it be better to stop when the region gets
>too big, so we at least have a chance of vectorising something?)
>
>It also seems more natural for vect_slp_bb_region to create the
>bb_vec_info itself rather than (a) having to pass bits of data down
>for the initialisation and (b) forcing vect_slp_analyze_bb_1 to free
>on every failure return.
>
>Tested on aarch64-linux-gnu and x86_64-linux-gnu.  OK to install?

Ok. But I wonder what the reason was for this limit? Dependency analysis was greatly simplified and is no longer quadratic here. Can you check where the limit originally came from? But indeed splitting the region makes more sense then, at dataref group boundaries I'd say.

Richard. 

Richard Sandiford Oct. 20, 2019, 12:54 p.m. UTC | #2
Richard Biener <richard.guenther@gmail.com> writes:
> On October 19, 2019 5:06:40 PM GMT+02:00, Richard Sandiford <richard.sandiford@arm.com> wrote:
>>After the previous patch, it seems more natural to apply the
>>PARAM_SLP_MAX_INSNS_IN_BB threshold as soon as we know what
>>the region is, rather than delaying it to vect_slp_analyze_bb_1.
>>(But rather than carve out the biggest region possible and then
>>reject it, wouldn't it be better to stop when the region gets
>>too big, so we at least have a chance of vectorising something?)
>>
>>It also seems more natural for vect_slp_bb_region to create the
>>bb_vec_info itself rather than (a) having to pass bits of data down
>>for the initialisation and (b) forcing vect_slp_analyze_bb_1 to free
>>on every failure return.
>>
>>Tested on aarch64-linux-gnu and x86_64-linux-gnu.  OK to install?
>
> Ok. But I wonder what the reason was for this limit? Dependency analysis was greatly simplified, being no longer quadratic here. Can you check where the limit originally came from? But indeed splitting the region makes more sense then, but at dataref group boundaries I'd say. 

Yeah, looks like it was the complexity of dependence analysis:

  https://gcc.gnu.org/ml/gcc-patches/2009-05/msg01303.html

  > Is there any limit on the size of the BB you consider for
  > vectorization?  I see we do compute_all_dependences on it - that
  > might be quite costly.

  I added slp-max-insns-in-bb parameter with initial value 1000.

Thanks,
Richard
Richard Biener Oct. 21, 2019, 6 a.m. UTC | #3
On October 20, 2019 2:54:48 PM GMT+02:00, Richard Sandiford <richard.sandiford@arm.com> wrote:
>Richard Biener <richard.guenther@gmail.com> writes:
>> On October 19, 2019 5:06:40 PM GMT+02:00, Richard Sandiford
><richard.sandiford@arm.com> wrote:
>>>After the previous patch, it seems more natural to apply the
>>>PARAM_SLP_MAX_INSNS_IN_BB threshold as soon as we know what
>>>the region is, rather than delaying it to vect_slp_analyze_bb_1.
>>>(But rather than carve out the biggest region possible and then
>>>reject it, wouldn't it be better to stop when the region gets
>>>too big, so we at least have a chance of vectorising something?)
>>>
>>>It also seems more natural for vect_slp_bb_region to create the
>>>bb_vec_info itself rather than (a) having to pass bits of data down
>>>for the initialisation and (b) forcing vect_slp_analyze_bb_1 to free
>>>on every failure return.
>>>
>>>Tested on aarch64-linux-gnu and x86_64-linux-gnu.  OK to install?
>>
>> Ok. But I wonder what the reason was for this limit? Dependency
>analysis was greatly simplified, being no longer quadratic here. Can
>you check where the limit originally came from? But indeed splitting
>the region makes more sense then, but at dataref group boundaries I'd
>say. 
>
>Yeah, looks it was the complexity of dependence analysis:
>
>  https://gcc.gnu.org/ml/gcc-patches/2009-05/msg01303.html

OK. We no longer compute dependences between all memory refs but only verify we can do those code motions we need to do. That's of course much harder to check a bound on upfront (it's still quadratic in the worst case). I'm also not sure this is ever a problem, but we might instead count the number of stmts involving memory?

>  > Is there any limit on the size of the BB you consider for
>  > vectorization?  I see we do compute_all_dependences on it - that
>  > might be quite costly.
>
>  I added slp-max-insns-in-bb parameter with initial value 1000.
>
>Thanks,
>Richard
Richard Sandiford Oct. 21, 2019, 10:08 a.m. UTC | #4
Richard Biener <richard.guenther@gmail.com> writes:
> On October 20, 2019 2:54:48 PM GMT+02:00, Richard Sandiford <richard.sandiford@arm.com> wrote:
>>Richard Biener <richard.guenther@gmail.com> writes:
>>> On October 19, 2019 5:06:40 PM GMT+02:00, Richard Sandiford
>><richard.sandiford@arm.com> wrote:
>>>>After the previous patch, it seems more natural to apply the
>>>>PARAM_SLP_MAX_INSNS_IN_BB threshold as soon as we know what
>>>>the region is, rather than delaying it to vect_slp_analyze_bb_1.
>>>>(But rather than carve out the biggest region possible and then
>>>>reject it, wouldn't it be better to stop when the region gets
>>>>too big, so we at least have a chance of vectorising something?)
>>>>
>>>>It also seems more natural for vect_slp_bb_region to create the
>>>>bb_vec_info itself rather than (a) having to pass bits of data down
>>>>for the initialisation and (b) forcing vect_slp_analyze_bb_1 to free
>>>>on every failure return.
>>>>
>>>>Tested on aarch64-linux-gnu and x86_64-linux-gnu.  OK to install?
>>>
>>> Ok. But I wonder what the reason was for this limit? Dependency
>>analysis was greatly simplified, being no longer quadratic here. Can
>>you check where the limit originally came from? But indeed splitting
>>the region makes more sense then, but at dataref group boundaries I'd
>>say. 
>>
>>Yeah, looks it was the complexity of dependence analysis:
>>
>>  https://gcc.gnu.org/ml/gcc-patches/2009-05/msg01303.html
>
> OK. We no longer
> Call compute dependence between all memory refs but only verify we can do those code-motions we need to do. That's of course much harder to check a bound on upfront (it's still quadratic in the worst case). I'm also not sure this is ever a problem, but we might instead count the number of stmts involving memory? 

OK, here's what I tried.  It exposes quadratic behaviour for
gcc.dg/pr87985.c on x86_64 though.  To make things worse, this is
all in the name of "vectorising" using V1DI. :-)

In that test we have N data references in which the even elements appear
to be vectorisable but the odd elements don't.  First time round, we hit:

  /* For basic block SLP, try to break the group up into multiples of the
     vector size.  */
  unsigned HOST_WIDE_INT const_nunits;
  if (is_a <bb_vec_info> (vinfo)
      && STMT_VINFO_GROUPED_ACCESS (stmt_info)
      && DR_GROUP_FIRST_ELEMENT (stmt_info)
      && nunits.is_constant (&const_nunits))

This breaks off the leading even element and recurses, discards the
leading odd element, and then recurses on the rest.  We then come round
to the same code when recursing on the rest.

It looks like matches is valid for all elements, so I guess we should
divide the group based on all false elements, not just the first?

Richard


2019-10-21  Richard Sandiford  <richard.sandiford@arm.com>

gcc/
	* params.def (PARAM_SLP_MAX_INSNS_IN_BB): Replace with...
	(PARAM_SLP_MAX_MEMREFS_IN_BB): ...this new --param.
	* doc/invoke.texi: Update accordingly.
	* tree-vect-slp.c (vect_slp_bb): Compute region_end during the
	search loop.  Stop after finding PARAM_SLP_MAX_MEMREFS_IN_BB
	data references.

Index: gcc/params.def
===================================================================
--- gcc/params.def	2019-10-16 10:50:00.064243704 +0100
+++ gcc/params.def	2019-10-21 10:48:45.769347466 +0100
@@ -1030,10 +1030,10 @@ DEFPARAM (PARAM_PROFILE_FUNC_INTERNAL_ID
 	  0, 0, 1)
 
 /* Avoid SLP vectorization of large basic blocks.  */
-DEFPARAM (PARAM_SLP_MAX_INSNS_IN_BB,
-	  "slp-max-insns-in-bb",
-	  "Maximum number of instructions in basic block to be considered for "
-	  "SLP vectorization.", 1000, 0, 0)
+DEFPARAM (PARAM_SLP_MAX_MEMREFS_IN_BB,
+	  "slp-max-memrefs-in-bb",
+	  "Maximum number of memory references to be considered for "
+	  "SLP vectorization.", 1024, 0, 0)
 
 DEFPARAM (PARAM_MIN_INSN_TO_PREFETCH_RATIO,
 	  "min-insn-to-prefetch-ratio",
Index: gcc/doc/invoke.texi
===================================================================
--- gcc/doc/invoke.texi	2019-10-16 10:49:59.340248881 +0100
+++ gcc/doc/invoke.texi	2019-10-21 10:48:45.769347466 +0100
@@ -12248,9 +12248,8 @@ by the copy loop headers pass.
 @item vect-epilogues-nomask
 Enable loop epilogue vectorization using smaller vector size.
 
-@item slp-max-insns-in-bb
-Maximum number of instructions in basic block to be
-considered for SLP vectorization.
+@item slp-max-memrefs-in-bb
+Maximum number of memory references to be considered for SLP vectorization.
 
 @item avoid-fma-max-bits
 Maximum number of bits for which we avoid creating FMAs.
Index: gcc/tree-vect-slp.c
===================================================================
--- gcc/tree-vect-slp.c	2019-10-21 07:41:32.997886232 +0100
+++ gcc/tree-vect-slp.c	2019-10-21 10:48:45.769347466 +0100
@@ -3075,12 +3075,19 @@ vect_slp_bb (basic_block bb)
   while (!gsi_end_p (gsi))
     {
       gimple_stmt_iterator region_begin = gsi;
+      gimple_stmt_iterator region_end;
       vec<data_reference_p> datarefs = vNULL;
+      unsigned int max_datarefs = PARAM_VALUE (PARAM_SLP_MAX_MEMREFS_IN_BB);
       int insns = 0;
 
-      for (; !gsi_end_p (gsi); gsi_next (&gsi))
+      while (1)
 	{
+	  region_end = gsi;
+	  if (gsi_end_p (gsi) || datarefs.length () >= max_datarefs)
+	    break;
+
 	  gimple *stmt = gsi_stmt (gsi);
+	  gsi_next (&gsi);
 	  if (is_gimple_debug (stmt))
 	    continue;
 	  insns++;
@@ -3089,33 +3096,17 @@ vect_slp_bb (basic_block bb)
 	    vect_location = stmt;
 
 	  if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs))
+	    /* Don't include the statement in either this region or
+	       the next one.  */
 	    break;
 	}
 
       /* Skip leading unhandled stmts.  */
-      if (gsi_stmt (region_begin) == gsi_stmt (gsi))
-	{
-	  gsi_next (&gsi);
-	  continue;
-	}
+      if (insns == 0)
+	continue;
 
-      gimple_stmt_iterator region_end = gsi;
-
-      if (insns > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
-	{
-	  if (dump_enabled_p ())
-	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-			     "not vectorized: too many instructions in "
-			     "basic block.\n");
-	}
-      else if (vect_slp_bb_region (region_begin, region_end, datarefs, insns))
+      if (vect_slp_bb_region (region_begin, region_end, datarefs, insns))
 	any_vectorized = true;
-
-      if (gsi_end_p (region_end))
-	break;
-
-      /* Skip the unhandled stmt.  */
-      gsi_next (&gsi);
     }
 
   return any_vectorized;
Richard Biener Oct. 21, 2019, 1:39 p.m. UTC | #5
On Mon, Oct 21, 2019 at 12:08 PM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Richard Biener <richard.guenther@gmail.com> writes:
> > On October 20, 2019 2:54:48 PM GMT+02:00, Richard Sandiford <richard.sandiford@arm.com> wrote:
> >>Richard Biener <richard.guenther@gmail.com> writes:
> >>> On October 19, 2019 5:06:40 PM GMT+02:00, Richard Sandiford
> >><richard.sandiford@arm.com> wrote:
> >>>>After the previous patch, it seems more natural to apply the
> >>>>PARAM_SLP_MAX_INSNS_IN_BB threshold as soon as we know what
> >>>>the region is, rather than delaying it to vect_slp_analyze_bb_1.
> >>>>(But rather than carve out the biggest region possible and then
> >>>>reject it, wouldn't it be better to stop when the region gets
> >>>>too big, so we at least have a chance of vectorising something?)
> >>>>
> >>>>It also seems more natural for vect_slp_bb_region to create the
> >>>>bb_vec_info itself rather than (a) having to pass bits of data down
> >>>>for the initialisation and (b) forcing vect_slp_analyze_bb_1 to free
> >>>>on every failure return.
> >>>>
> >>>>Tested on aarch64-linux-gnu and x86_64-linux-gnu.  OK to install?
> >>>
> >>> Ok. But I wonder what the reason was for this limit? Dependency
> >>analysis was greatly simplified, being no longer quadratic here. Can
> >>you check where the limit originally came from? But indeed splitting
> >>the region makes more sense then, but at dataref group boundaries I'd
> >>say.
> >>
> >>Yeah, looks it was the complexity of dependence analysis:
> >>
> >>  https://gcc.gnu.org/ml/gcc-patches/2009-05/msg01303.html
> >
> > OK. We no longer
> > Call compute dependence between all memory refs but only verify we can do those code-motions we need to do. That's of course much harder to check a bound on upfront (it's still quadratic in the worst case). I'm also not sure this is ever a problem, but we might instead count the number of stmts involving memory?
>
> OK, here's what I tried.  It exposes quadratic behaviour for
> gcc.dg/pr87985.c on x86_64 though.  To make things worse, this is
> all in the name of "vectorising" using V1DI. :-)
>
> In that test we have N data references in which the even elements appear
> to be vectorisable but the odd elements don't.  First time round, we hit:
>
>   /* For basic block SLP, try to break the group up into multiples of the
>      vector size.  */
>   unsigned HOST_WIDE_INT const_nunits;
>   if (is_a <bb_vec_info> (vinfo)
>       && STMT_VINFO_GROUPED_ACCESS (stmt_info)
>       && DR_GROUP_FIRST_ELEMENT (stmt_info)
>       && nunits.is_constant (&const_nunits))
>
> This breaks off the leading even element and recurses, discards the
> leading odd element, and then recurses on the rest.  We then come round
> to the same code when recursing on the rest.
>
> It looks like matches is valid for all elements, so I guess we should
> divide the group based on all false elements, not just the first?

What do you mean by "valid for all elements"?  matches[i] is
true or false for all elements?  If it is true we shouldn't split
at all.

But indeed the recursion is bad if we split out one vector per step;
we should split the whole group in "half" instead.

Still the code should ensure we have a "half" that works OK
and one that possibly doesn't.

OK, I see we have {true, true, false <repeats ... times> } for
matches but the 2nd iteration gets {true, false, ... } and doesn't
split for me.  Of course if you have V1DI then you'll notice
that matches[0] is _always_ true ... I guess we should simply
never try splitting for const_nunits == 1?  Or even never
do BB vectorization with such a vector type.

Richard.

Richard Sandiford Oct. 21, 2019, 2:06 p.m. UTC | #6
Richard Biener <richard.guenther@gmail.com> writes:
> On Mon, Oct 21, 2019 at 12:08 PM Richard Sandiford
> <richard.sandiford@arm.com> wrote:
>>
>> Richard Biener <richard.guenther@gmail.com> writes:
>> > On October 20, 2019 2:54:48 PM GMT+02:00, Richard Sandiford <richard.sandiford@arm.com> wrote:
>> >>Richard Biener <richard.guenther@gmail.com> writes:
>> >>> On October 19, 2019 5:06:40 PM GMT+02:00, Richard Sandiford
>> >><richard.sandiford@arm.com> wrote:
>> >>>>After the previous patch, it seems more natural to apply the
>> >>>>PARAM_SLP_MAX_INSNS_IN_BB threshold as soon as we know what
>> >>>>the region is, rather than delaying it to vect_slp_analyze_bb_1.
>> >>>>(But rather than carve out the biggest region possible and then
>> >>>>reject it, wouldn't it be better to stop when the region gets
>> >>>>too big, so we at least have a chance of vectorising something?)
>> >>>>
>> >>>>It also seems more natural for vect_slp_bb_region to create the
>> >>>>bb_vec_info itself rather than (a) having to pass bits of data down
>> >>>>for the initialisation and (b) forcing vect_slp_analyze_bb_1 to free
>> >>>>on every failure return.
>> >>>>
>> >>>>Tested on aarch64-linux-gnu and x86_64-linux-gnu.  OK to install?
>> >>>
>> >>> Ok. But I wonder what the reason was for this limit? Dependency
>> >>analysis was greatly simplified, being no longer quadratic here. Can
>> >>you check where the limit originally came from? But indeed splitting
>> >>the region makes more sense then, but at dataref group boundaries I'd
>> >>say.
>> >>
>> >>Yeah, looks it was the complexity of dependence analysis:
>> >>
>> >>  https://gcc.gnu.org/ml/gcc-patches/2009-05/msg01303.html
>> >
>> > OK. We no longer
>> > Call compute dependence between all memory refs but only verify we can do those code-motions we need to do. That's of course much harder to check a bound on upfront (it's still quadratic in the worst case). I'm also not sure this is ever a problem, but we might instead count the number of stmts involving memory?
>>
>> OK, here's what I tried.  It exposes quadratic behaviour for
>> gcc.dg/pr87985.c on x86_64 though.  To make things worse, this is
>> all in the name of "vectorising" using V1DI. :-)
>>
>> In that test we have N data references in which the even elements appear
>> to be vectorisable but the odd elements don't.  First time round, we hit:
>>
>>   /* For basic block SLP, try to break the group up into multiples of the
>>      vector size.  */
>>   unsigned HOST_WIDE_INT const_nunits;
>>   if (is_a <bb_vec_info> (vinfo)
>>       && STMT_VINFO_GROUPED_ACCESS (stmt_info)
>>       && DR_GROUP_FIRST_ELEMENT (stmt_info)
>>       && nunits.is_constant (&const_nunits))
>>
>> This breaks off the leading even element and recurses, discards the
>> leading odd element, and then recurses on the rest.  We then come round
>> to the same code when recursing on the rest.
>>
>> It looks like matches is valid for all elements, so I guess we should
>> divide the group based on all false elements, not just the first?
>
> What do you mean with "valid for all elements"?  matches[i] is
> true or false for all elements?  If it is true we shouldn't split
> at all.

Initially I'd wondered whether the elements were undefined after the
first "false", i.e. if we broke early at that point.  But it looks like
true and false elements after the first false are still meaningful.

> But indeed the recursion is bad if we split out one vector a step,
> we should split the whole group in "half" instead.
>
> Still the code should ensure we have a "half" that works OK
> and one that possibly doesn't.
>
> OK, I see we have {true, true, false <repeats ... times> } for
> matches but the 2nd iteration gets {true, false, ... } and doesn't
> split for me.  Of course if you have V1DI then you'll notice
> that matches[0] is _always_ true ... I guess we should simply
> never try splitting for const_nunits == 1?  Or even never
> do BB vectorization with such a vector type.

I don't think it's specific to const_nunits == 1.  E.g. if we have
matches == { true, true, false, true } repeating for const_nunits == 2,
we'll have the same problem of trying:

  N
  2, N-4
     2, N-8
        2, N-12,
           2, N-16
              ....

which is still quadratic.
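
As a rough standalone illustration (plain C, not GCC code; it just
assumes the per-step analysis cost is linear in the remaining group
size and, as above, that each peel-one step removes four group
elements):

  #include <stdio.h>

  /* Peeling one vector off the front each step costs
     N + (N-4) + (N-8) + ... = O(N^2), whereas splitting in half
     costs N + 2*(N/2) + 4*(N/4) + ... = O(N log N).  */

  static unsigned long
  cost_peel_one (unsigned n)
  {
    /* Analyze the group (cost n), keep one vector, drop one vector,
       then recurse on the remaining n - 4 elements.  */
    return n < 4 ? n : n + cost_peel_one (n - 4);
  }

  static unsigned long
  cost_halve (unsigned n)
  {
    /* Analyze the group (cost n), then recurse on both halves.  */
    return n < 4 ? n : n + cost_halve (n / 2) + cost_halve (n - n / 2);
  }

  int
  main (void)
  {
    for (unsigned n = 1024; n <= 16384; n *= 4)
      printf ("n=%6u  peel-one=%9lu  halve=%8lu\n",
              n, cost_peel_one (n), cost_halve (n));
    return 0;
  }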

Dividing genuinely into half would fix that, so would dividing the
whole group based on the group1_size.  Or we could try using the
whole matches array to divide the group up into chunks that are
worth recursing on, with each chunk having at most half the original
group size and with the matches elements for each chunk being all-true
or all-false.
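
A minimal sketch of that last idea, assuming only a plain bool array
for matches and a cap of half the group size (standalone C, not the
GCC representation):

  #include <stdbool.h>
  #include <stdio.h>

  /* Divide [0, n) into chunks whose MATCHES entries are all equal,
     capping each chunk at MAX_CHUNK elements so that any recursion
     sees at most half of the original group.  */
  static void
  split_by_matches (const bool *matches, unsigned n, unsigned max_chunk)
  {
    unsigned i = 0;
    while (i < n)
      {
        unsigned j = i + 1;
        while (j < n && matches[j] == matches[i] && j - i < max_chunk)
          j++;
        printf ("chunk [%u, %u): matches=%s\n", i, j,
                matches[i] ? "true" : "false");
        i = j;
      }
  }

  int
  main (void)
  {
    /* The { true, true, false, true } pattern from above.  */
    bool matches[16];
    for (unsigned i = 0; i < 16; i++)
      matches[i] = (i % 4) != 2;
    split_by_matches (matches, 16, 16 / 2);
    return 0;
  }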

Thanks,
Richard
Richard Biener Oct. 21, 2019, 3:56 p.m. UTC | #7
On October 21, 2019 4:06:49 PM GMT+02:00, Richard Sandiford <richard.sandiford@arm.com> wrote:
>Richard Biener <richard.guenther@gmail.com> writes:
>> On Mon, Oct 21, 2019 at 12:08 PM Richard Sandiford
>> <richard.sandiford@arm.com> wrote:
>>>
>>> Richard Biener <richard.guenther@gmail.com> writes:
>>> > On October 20, 2019 2:54:48 PM GMT+02:00, Richard Sandiford
><richard.sandiford@arm.com> wrote:
>>> >>Richard Biener <richard.guenther@gmail.com> writes:
>>> >>> On October 19, 2019 5:06:40 PM GMT+02:00, Richard Sandiford
>>> >><richard.sandiford@arm.com> wrote:
>>> >>>>After the previous patch, it seems more natural to apply the
>>> >>>>PARAM_SLP_MAX_INSNS_IN_BB threshold as soon as we know what
>>> >>>>the region is, rather than delaying it to vect_slp_analyze_bb_1.
>>> >>>>(But rather than carve out the biggest region possible and then
>>> >>>>reject it, wouldn't it be better to stop when the region gets
>>> >>>>too big, so we at least have a chance of vectorising something?)
>>> >>>>
>>> >>>>It also seems more natural for vect_slp_bb_region to create the
>>> >>>>bb_vec_info itself rather than (a) having to pass bits of data
>down
>>> >>>>for the initialisation and (b) forcing vect_slp_analyze_bb_1 to
>free
>>> >>>>on every failure return.
>>> >>>>
>>> >>>>Tested on aarch64-linux-gnu and x86_64-linux-gnu.  OK to
>install?
>>> >>>
>>> >>> Ok. But I wonder what the reason was for this limit? Dependency
>>> >>analysis was greatly simplified, being no longer quadratic here.
>Can
>>> >>you check where the limit originally came from? But indeed
>splitting
>>> >>the region makes more sense then, but at dataref group boundaries
>I'd
>>> >>say.
>>> >>
>>> >>Yeah, looks it was the complexity of dependence analysis:
>>> >>
>>> >>  https://gcc.gnu.org/ml/gcc-patches/2009-05/msg01303.html
>>> >
>>> > OK. We no longer
>>> > Call compute dependence between all memory refs but only verify we
>can do those code-motions we need to do. That's of course much harder
>to check a bound on upfront (it's still quadratic in the worst case).
>I'm also not sure this is ever a problem, but we might instead count
>the number of stmts involving memory?
>>>
>>> OK, here's what I tried.  It exposes quadratic behaviour for
>>> gcc.dg/pr87985.c on x86_64 though.  To make things worse, this is
>>> all in the name of "vectorising" using V1DI. :-)
>>>
>>> In that test we have N data references in which the even elements
>appear
>>> to be vectorisable but the odd elements don't.  First time round, we
>hit:
>>>
>>>   /* For basic block SLP, try to break the group up into multiples
>of the
>>>      vector size.  */
>>>   unsigned HOST_WIDE_INT const_nunits;
>>>   if (is_a <bb_vec_info> (vinfo)
>>>       && STMT_VINFO_GROUPED_ACCESS (stmt_info)
>>>       && DR_GROUP_FIRST_ELEMENT (stmt_info)
>>>       && nunits.is_constant (&const_nunits))
>>>
>>> This breaks off the leading even element and recurses, discards the
>>> leading odd element, and then recurses on the rest.  We then come
>round
>>> to the same code when recursing on the rest.
>>>
>>> It looks like matches is valid for all elements, so I guess we
>should
>>> divide the group based on all false elements, not just the first?
>>
>> What do you mean with "valid for all elements"?  matches[i] is
>> true or false for all elements?  If it is true we shouldn't split
>> at all.
>
>Initially I'd wondered whether the elements were undefined after the
>first "false", i.e. if we broke early at that point.  But it looks like
>true and false elements after the first false are still meaningful.
>
>> But indeed the recursion is bad if we split out one vector a step,
>> we should split the whole group in "half" instead.
>>
>> Still the code should ensure we have a "half" that works OK
>> and one that possibly doesn't.
>>
>> OK, I see we have {true, true, false <repeats ... times> } for
>> matches but the 2nd iteration gets {true, false, ... } and doesn't
>> split for me.  Of course if you have V1DI then you'll notice
>> that matches[0] is _always_ true ... I guess we should simply
>> never try splitting for const_nunits == 1?  Or even never
>> do BB vectorization with such a vector type.
>
>I don't think it's specific to const_nunits == 1.  E.g. if we have
>matches == { true, true, false, true } repeating for const_nunits == 2,
>we'll have the same problem of trying:
>
>  N
>  2, N-4
>     2, N-8
>        2, N-12,
>           2, N-16
>              ....
>
>which is still quadratic.

Yeah. 

>Dividing genuinely into half would fix that, so would dividing the
>whole group based on the group1_size. 

I think the smaller split was meant as an optimization, and originally we didn't recurse on the rest... So yes, simply halving might help.

>Or we could try using the
>whole matches array to divide the group up into chunks that are
>worth recursing on, with each chunk having at most half the original
>group size and with the matches elements for each chunk being all-true
>or all-false.
>
>Thanks,
>Richard

Patch

Index: gcc/tree-vect-slp.c
===================================================================
--- gcc/tree-vect-slp.c	2019-10-19 15:59:53.875379473 +0100
+++ gcc/tree-vect-slp.c	2019-10-19 16:01:13.714824484 +0100
@@ -2836,47 +2836,24 @@  vect_bb_vectorization_profitable_p (bb_v
   return true;
 }
 
-/* Check if the basic block can be vectorized.  Returns a bb_vec_info
-   if so and sets fatal to true if failure is independent of
-   current_vector_size.  */
-
-static bb_vec_info
-vect_slp_analyze_bb_1 (gimple_stmt_iterator region_begin,
-		       gimple_stmt_iterator region_end,
-		       vec<data_reference_p> datarefs, int n_stmts,
-		       bool &fatal, vec_info_shared *shared)
+/* Check if the region described by BB_VINFO can be vectorized, returning
+   true if so.  When returning false, set FATAL to true if the same failure
+   would prevent vectorization at other vector sizes, false if it is still
+   worth trying other sizes.  N_STMTS is the number of statements in the
+   region.  */
+
+static bool
+vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal)
 {
   DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
 
-  bb_vec_info bb_vinfo;
   slp_instance instance;
   int i;
   poly_uint64 min_vf = 2;
-  bool first_time_p = shared->datarefs.is_empty ();
 
   /* The first group of checks is independent of the vector size.  */
   fatal = true;
 
-  if (n_stmts > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
-    {
-      if (dump_enabled_p ())
-	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-			 "not vectorized: too many instructions in "
-			 "basic block.\n");
-      free_data_refs (datarefs);
-      return NULL;
-    }
-
-  bb_vinfo = new _bb_vec_info (region_begin, region_end, shared);
-  if (!bb_vinfo)
-    return NULL;
-
-  BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
-  if (first_time_p)
-    bb_vinfo->shared->save_datarefs ();
-  else
-    bb_vinfo->shared->check_datarefs ();
-
   /* Analyze the data references.  */
 
   if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
@@ -2885,9 +2862,7 @@  vect_slp_analyze_bb_1 (gimple_stmt_itera
         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 			 "not vectorized: unhandled data-ref in basic "
 			 "block.\n");
-
-      delete bb_vinfo;
-      return NULL;
+      return false;
     }
 
   if (BB_VINFO_DATAREFS (bb_vinfo).length () < 2)
@@ -2896,9 +2871,7 @@  vect_slp_analyze_bb_1 (gimple_stmt_itera
         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 			 "not vectorized: not enough data-refs in "
 			 "basic block.\n");
-
-      delete bb_vinfo;
-      return NULL;
+      return false;
     }
 
   if (!vect_analyze_data_ref_accesses (bb_vinfo))
@@ -2907,9 +2880,7 @@  vect_slp_analyze_bb_1 (gimple_stmt_itera
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 			"not vectorized: unhandled data access in "
 			"basic block.\n");
-
-      delete bb_vinfo;
-      return NULL;
+      return false;
     }
 
   /* If there are no grouped stores in the region there is no need
@@ -2921,9 +2892,7 @@  vect_slp_analyze_bb_1 (gimple_stmt_itera
 	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 			 "not vectorized: no grouped stores in "
 			 "basic block.\n");
-
-      delete bb_vinfo;
-      return NULL;
+      return false;
     }
 
   /* While the rest of the analysis below depends on it in some way.  */
@@ -2943,9 +2912,7 @@  vect_slp_analyze_bb_1 (gimple_stmt_itera
 			   "not vectorized: failed to find SLP opportunities "
 			   "in basic block.\n");
 	}
-
-      delete bb_vinfo;
-      return NULL;
+      return false;
     }
 
   vect_record_base_alignments (bb_vinfo);
@@ -2976,19 +2943,14 @@  vect_slp_analyze_bb_1 (gimple_stmt_itera
       i++;
     }
   if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
-    {
-      delete bb_vinfo;
-      return NULL;
-    }
+    return false;
 
   if (!vect_slp_analyze_operations (bb_vinfo))
     {
       if (dump_enabled_p ())
         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 			 "not vectorized: bad operation in basic block.\n");
-
-      delete bb_vinfo;
-      return NULL;
+      return false;
     }
 
   /* Cost model: check if the vectorization is worthwhile.  */
@@ -2999,16 +2961,13 @@  vect_slp_analyze_bb_1 (gimple_stmt_itera
         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 			 "not vectorized: vectorization is not "
 			 "profitable.\n");
-
-      delete bb_vinfo;
-      return NULL;
+      return false;
     }
 
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
 		     "Basic block will be vectorized using SLP\n");
-
-  return bb_vinfo;
+  return true;
 }
 
 /* Subroutine of vect_slp_bb.  Try to vectorize the statements between
@@ -3037,9 +2996,16 @@  vect_slp_bb_region (gimple_stmt_iterator
     {
       bool vectorized = false;
       bool fatal = false;
-      bb_vinfo = vect_slp_analyze_bb_1 (region_begin, region_end,
-					datarefs, n_stmts, fatal, &shared);
-      if (bb_vinfo
+      bb_vinfo = new _bb_vec_info (region_begin, region_end, &shared);
+
+      bool first_time_p = shared.datarefs.is_empty ();
+      BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
+      if (first_time_p)
+	bb_vinfo->shared->save_datarefs ();
+      else
+	bb_vinfo->shared->check_datarefs ();
+
+      if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal)
 	  && dbg_cnt (vect_slp))
 	{
 	  if (dump_enabled_p ())
@@ -3132,7 +3098,14 @@  vect_slp_bb (basic_block bb)
 
       gimple_stmt_iterator region_end = gsi;
 
-      if (vect_slp_bb_region (region_begin, region_end, datarefs, insns))
+      if (insns > PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB))
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "not vectorized: too many instructions in "
+			     "basic block.\n");
+	}
+      else if (vect_slp_bb_region (region_begin, region_end, datarefs, insns))
 	any_vectorized = true;
 
       if (gsi_end_p (region_end))