diff mbox

Misaligned data references and cost model for basic block SLP

Message ID OF463AC6DC.FBABCB2B-ONC225777D.0026FE66-C2257780.00268E78@il.ibm.com
State New
Headers show

Commit Message

Ira Rosen Aug. 15, 2010, 7:01 a.m. UTC
Hi,

This patch adds support for misaligned data accesses in basic block SLP.
Since this feature can make vectorization unprofitable, this patch also
adds a simple cost model for basic blocks.

Bootstrapped and tested on powerpc64-suse-linux.
Committed.

Ira


ChangeLog:

	* tree-vect-data-refs.c (vect_setup_realignment): Support realignment
	in basic blocks.
	(vect_supportable_dr_alignment): Check alignment for basic blocks.
	* tree-vect-slp.c (vect_build_slp_tree): Allow different codes for
	data references.
	(vect_bb_vectorization_profitable_p): New function.
	(vect_slp_analyze_bb): Call vect_bb_vectorization_profitable_p() to
	check if it's worthwhile to vectorize the basic block.

testsuite/ChangeLog:

	* gcc.dg/vect/costmodel/ppc/costmodel-bb-slp-9a.c: New test.
	* gcc.dg/vect/costmodel/ppc/ppc-costmodel-vect.exp: Run basic block
	SLP tests.
	* gcc.dg/vect/bb-slp-9.c: Now vectorizable on targets that support
	misaligned loads.
	* gcc.dg/vect/bb-slp-10.c: Now vectorizable on targets that support
	misaligned stores.
	* gcc.dg/vect/bb-slp-2.c: Avoid loop vectorization.

Comments

H.J. Lu Sept. 1, 2010, 1:13 a.m. UTC | #1
On Sun, Aug 15, 2010 at 12:01 AM, Ira Rosen <IRAR@il.ibm.com> wrote:
>
> Hi,
>
> This patch adds support of misaligned data accesses in basic block SLP.
> Since this feature can make vectorization not profitable, this patch also
> adds a simple cost model for basic blocks.
>
> Bootstrapped and tested on powerpc64-suse-linux.
> Committed.
>
> Ira
>
>
> ChangeLog:
>
>        * tree-vect-data-refs.c (vect_setup_realignment): Support realignment
>        in basic blocks.
>        (vect_supportable_dr_alignment): Check alignment for basic blocks.
>        * tree-vect-slp.c (vect_build_slp_tree): Allow different codes for
>        data references.
>        (vect_bb_vectorization_profitable_p): New function.
>        (vect_slp_analyze_bb): Call vect_bb_vectorization_profitable_p() to
>        check if it's worthwhile to vectorize the basic block.
>
> testsuite/ChangeLog:
>
>        * gcc.dg/vect/costmodel/ppc/costmodel-bb-slp-9a.c: New test.
>        * gcc.dg/vect/costmodel/ppc/ppc-costmodel-vect.exp: Run basic block
>        SLP tests.
>        * gcc.dg/vect/bb-slp-9.c: Now vectorizable on targets that support
>        misaligned loads.
>        * gcc.dg/vect/bb-slp-10.c: Now vectorizable on targets that support
>        misaligned stores.c
>        * gcc.dg/vect/bb-slp-2.c: Avoid loop vectorization.
>
>

This caused:

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=45470
diff mbox

Patch

Index: tree-vect-data-refs.c
===================================================================
--- tree-vect-data-refs.c	(revision 163259)
+++ tree-vect-data-refs.c	(working copy)
@@ -3467,8 +3467,8 @@  vect_setup_realignment (gimple stmt, gim
   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
-  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
-  edge pe;
+  struct loop *loop = NULL;
+  edge pe = NULL;
   tree scalar_dest = gimple_assign_lhs (stmt);
   tree vec_dest;
   gimple inc;
@@ -3483,9 +3483,15 @@  vect_setup_realignment (gimple stmt, gim
   gimple_seq stmts = NULL;
   bool inv_p;
   bool compute_in_loop = false;
-  bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
+  bool nested_in_vect_loop = false;
   struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
-  struct loop *loop_for_initial_load;
+  struct loop *loop_for_initial_load = NULL;
+
+  if (loop_vinfo)
+    {
+      loop = LOOP_VINFO_LOOP (loop_vinfo);
+      nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
+    }

   gcc_assert (alignment_support_scheme == dr_explicit_realign
 	      || alignment_support_scheme == dr_explicit_realign_optimized);
@@ -3523,7 +3529,7 @@  vect_setup_realignment (gimple stmt, gim
     or not, which in turn determines if the misalignment is computed inside
      the inner-loop, or outside LOOP.  */

-  if (init_addr != NULL_TREE)
+  if (init_addr != NULL_TREE || !loop_vinfo)
     {
       compute_in_loop = true;
       gcc_assert (alignment_support_scheme == dr_explicit_realign);
@@ -3555,6 +3561,9 @@  vect_setup_realignment (gimple stmt, gim
   if (at_loop)
     *at_loop = loop_for_initial_load;

+  if (loop_for_initial_load)
+    pe = loop_preheader_edge (loop_for_initial_load);
+
   /* 3. For the case of the optimized realignment, create the first vector
       load at the loop preheader.  */

@@ -3563,7 +3572,6 @@  vect_setup_realignment (gimple stmt, gim
       /* Create msq_init = *(floor(p1)) in the loop preheader  */

       gcc_assert (!compute_in_loop);
-      pe = loop_preheader_edge (loop_for_initial_load);
       vec_dest = vect_create_destination_var (scalar_dest, vectype);
       ptr = vect_create_data_ref_ptr (stmt, loop_for_initial_load, NULL_TREE,
 				      &init_addr, &inc, true, &inv_p);
@@ -3582,8 +3590,14 @@  vect_setup_realignment (gimple stmt, gim
       new_temp = make_ssa_name (vec_dest, new_stmt);
       gimple_assign_set_lhs (new_stmt, new_temp);
       mark_symbols_for_renaming (new_stmt);
-      new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
-      gcc_assert (!new_bb);
+      if (pe)
+        {
+          new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
+          gcc_assert (!new_bb);
+        }
+      else
+         gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
+
       msq_init = gimple_assign_lhs (new_stmt);
     }

@@ -3596,16 +3610,19 @@  vect_setup_realignment (gimple stmt, gim
       tree builtin_decl;

       /* Compute INIT_ADDR - the initial addressed accessed by this memref.  */
-      if (compute_in_loop)
-	gcc_assert (init_addr); /* already computed by the caller.  */
-      else
+      if (!init_addr)
 	{
 	  /* Generate the INIT_ADDR computation outside LOOP.  */
 	  init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
 							NULL_TREE, loop);
-	  pe = loop_preheader_edge (loop);
-	  new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
-	  gcc_assert (!new_bb);
+          if (loop)
+            {
+   	      pe = loop_preheader_edge (loop);
+	      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
+	      gcc_assert (!new_bb);
+            }
+          else
+             gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
 	}

       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
@@ -3979,12 +3996,11 @@  vect_supportable_dr_alignment (struct da
   if (aligned_access_p (dr) && !check_aligned_accesses)
     return dr_aligned;

-  if (!loop_vinfo)
-    /* FORNOW: Misaligned accesses are supported only in loops.  */
-    return dr_unaligned_unsupported;
-
-  vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
-  nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
+  if (loop_vinfo)
+    {
+      vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
+      nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
+    }

   /* Possibly unaligned access.  */

@@ -4059,9 +4075,10 @@  vect_supportable_dr_alignment (struct da
 	      || targetm.vectorize.builtin_mask_for_load ()))
 	{
 	  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
-	  if (nested_in_vect_loop
-	      && (TREE_INT_CST_LOW (DR_STEP (dr))
-		  != GET_MODE_SIZE (TYPE_MODE (vectype))))
+	  if ((nested_in_vect_loop
+	       && (TREE_INT_CST_LOW (DR_STEP (dr))
+	 	   != GET_MODE_SIZE (TYPE_MODE (vectype))))
+              || !loop_vinfo)
 	    return dr_explicit_realign;
 	  else
 	    return dr_explicit_realign_optimized;
Index: tree-vect-slp.c
===================================================================
--- tree-vect-slp.c	(revision 163259)
+++ tree-vect-slp.c	(working copy)
@@ -456,7 +456,12 @@  vect_build_slp_tree (loop_vec_info loop_
 	      && (first_stmt_code != IMAGPART_EXPR
 		  || rhs_code != REALPART_EXPR)
 	      && (first_stmt_code != REALPART_EXPR
-		  || rhs_code != IMAGPART_EXPR))
+		  || rhs_code != IMAGPART_EXPR)
+              && !(STMT_VINFO_STRIDED_ACCESS (vinfo_for_stmt (stmt))
+                   && (first_stmt_code == ARRAY_REF
+                       || first_stmt_code == INDIRECT_REF
+                       || first_stmt_code == COMPONENT_REF
+                       || first_stmt_code == MEM_REF)))
 	    {
 	      if (vect_print_dump_info (REPORT_SLP))
 		{
@@ -1509,7 +1514,75 @@  vect_slp_analyze_operations (bb_vec_info
 }


-/* Cheick if the basic block can be vectorized.  */
+/* Check if vectorization of the basic block is profitable.  */
+
+static bool
+vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
+{
+  VEC (slp_instance, heap) *slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
+  slp_instance instance;
+  int i;
+  unsigned int vec_outside_cost = 0, vec_inside_cost = 0, scalar_cost = 0;
+  unsigned int stmt_cost;
+  gimple stmt;
+  gimple_stmt_iterator si;
+  basic_block bb = BB_VINFO_BB (bb_vinfo);
+  stmt_vec_info stmt_info = NULL;
+  tree dummy_type = NULL;
+  int dummy = 0;
+
+  /* Calculate vector costs.  */
+  for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
+    {
+      vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
+      vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
+    }
+
+  /* Calculate scalar cost.  */
+  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
+    {
+      stmt = gsi_stmt (si);
+      stmt_info = vinfo_for_stmt (stmt);
+
+      if (!stmt_info || !STMT_VINFO_VECTORIZABLE (stmt_info)
+          || !PURE_SLP_STMT (stmt_info))
+        continue;
+
+      if (STMT_VINFO_DATA_REF (stmt_info))
+        {
+          if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
+            stmt_cost = targetm.vectorize.builtin_vectorization_cost
+                          (scalar_load, dummy_type, dummy);
+          else
+            stmt_cost = targetm.vectorize.builtin_vectorization_cost
+                          (scalar_store, dummy_type, dummy);
+        }
+      else
+        stmt_cost = targetm.vectorize.builtin_vectorization_cost
+                      (scalar_stmt, dummy_type, dummy);
+
+      scalar_cost += stmt_cost;
+    }
+
+  if (vect_print_dump_info (REPORT_COST))
+    {
+      fprintf (vect_dump, "Cost model analysis: \n");
+      fprintf (vect_dump, "  Vector inside of basic block cost: %d\n",
+               vec_inside_cost);
+      fprintf (vect_dump, "  Vector outside of basic block cost: %d\n",
+               vec_outside_cost);
+      fprintf (vect_dump, "  Scalar cost of basic block: %d", scalar_cost);
+    }
+
+  /* Vectorization is profitable if its cost is less than the cost of scalar
+     version.  */
+  if (vec_outside_cost + vec_inside_cost >= scalar_cost)
+    return false;
+
+  return true;
+}
+
+/* Check if the basic block can be vectorized.  */

 bb_vec_info
 vect_slp_analyze_bb (basic_block bb)
@@ -1641,6 +1714,18 @@  vect_slp_analyze_bb (basic_block bb)
       return NULL;
     }

+  /* Cost model: check if the vectorization is worthwhile.  */
+  if (flag_vect_cost_model
+      && !vect_bb_vectorization_profitable_p (bb_vinfo))
+    {
+      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
+        fprintf (vect_dump, "not vectorized: vectorization is not "
+                            "profitable.\n");
+
+      destroy_bb_vec_info (bb_vinfo);
+      return NULL;
+    }
+
   if (vect_print_dump_info (REPORT_DETAILS))
     fprintf (vect_dump, "Basic block will be vectorized using SLP\n");
 Index: testsuite/gcc.dg/vect/costmodel/ppc/costmodel-bb-slp-9a.c
===================================================================
--- testsuite/gcc.dg/vect/costmodel/ppc/costmodel-bb-slp-9a.c	(revision 0)
+++ testsuite/gcc.dg/vect/costmodel/ppc/costmodel-bb-slp-9a.c	(revision 0)
@@ -0,0 +1,47 @@ 
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "../../tree-vect.h"
+
+#define N 16
+
+unsigned int out[N];
+unsigned int in[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
+
+__attribute__ ((noinline)) int
+main1 (unsigned int x, unsigned int y)
+{
+  int i;
+  unsigned int *pin = &in[1];
+  unsigned int *pout = &out[0];
+  unsigned int a0, a1, a2, a3;
+
+  /* Misaligned load.  */
+  *pout++ = *pin++;
+  *pout++ = *pin++;
+  *pout++ = *pin++;
+  *pout++ = *pin++;
+
+  /* Check results.  */
+  if (out[0] != in[1]
+      || out[1] != in[2]
+      || out[2] != in[3]
+      || out[3] != in[4])
+    abort();
+
+  return 0;
+}
+
+int main (void)
+{
+  check_vect ();
+
+  main1 (2, 3);
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "basic block vectorized using SLP" 1 "slp"  { xfail  vect_no_align } } } */
+/* { dg-final { cleanup-tree-dump "slp" } } */
+
Index: testsuite/gcc.dg/vect/costmodel/ppc/ppc-costmodel-vect.exp
===================================================================
--- testsuite/gcc.dg/vect/costmodel/ppc/ppc-costmodel-vect.exp	(revision 163259)
+++ testsuite/gcc.dg/vect/costmodel/ppc/ppc-costmodel-vect.exp	(working copy)
@@ -57,7 +57,10 @@  if [check_vmx_hw_available] {
 # Initialize `dg'.
 dg-init

+set VECT_SLP_CFLAGS $DEFAULT_VECTCFLAGS
+
 lappend DEFAULT_VECTCFLAGS "-fdump-tree-vect-details"
+lappend VECT_SLP_CFLAGS "-fdump-tree-slp-details"

 # Main loop.
 dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/costmodel-pr*.\[cS\]]] \
@@ -66,6 +69,8 @@  dg-runtest [lsort [glob -nocomplain $src
 	"" $DEFAULT_VECTCFLAGS
 dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/costmodel-slp-*.\[cS\]]]  \
         "" $DEFAULT_VECTCFLAGS
+dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/costmodel-bb-slp*.\[cS\]]]  \
+        "" $VECT_SLP_CFLAGS

 #### Tests with special options
 global SAVED_DEFAULT_VECTCFLAGS
Index: testsuite/gcc.dg/vect/bb-slp-9.c
===================================================================
--- testsuite/gcc.dg/vect/bb-slp-9.c	(revision 163259)
+++ testsuite/gcc.dg/vect/bb-slp-9.c	(working copy)
@@ -47,7 +47,6 @@  int main (void)
   return 0;
 }

-/* { dg-final { scan-tree-dump-times "basic block vectorized using SLP" 0 "slp" } } */
-/* { dg-final { scan-tree-dump-times "unsupported alignment in basic block." 1 "slp" } } */
+/* { dg-final { scan-tree-dump-times "basic block vectorized using SLP" 1 "slp"  { xfail  vect_no_align } } } */
 /* { dg-final { cleanup-tree-dump "slp" } } */

Index: testsuite/gcc.dg/vect/bb-slp-10.c
===================================================================
--- testsuite/gcc.dg/vect/bb-slp-10.c	(revision 163259)
+++ testsuite/gcc.dg/vect/bb-slp-10.c	(working copy)
@@ -50,7 +50,7 @@  int main (void)
   return 0;
 }

-/* { dg-final { scan-tree-dump-times "basic block vectorized using SLP" 0 "slp" } } */
-/* { dg-final { scan-tree-dump-times "unsupported alignment in basic block." 1 "slp" } } */
+/* { dg-final { scan-tree-dump-times "unsupported alignment in basic block." 1 "slp" { xfail vect_hw_misalign } } } */
+/* { dg-final { scan-tree-dump-times "basic block vectorized using SLP" 1 "slp" { target vect_hw_misalign } } } */
 /* { dg-final { cleanup-tree-dump "slp" } } */

Index: testsuite/gcc.dg/vect/bb-slp-2.c
===================================================================
--- testsuite/gcc.dg/vect/bb-slp-2.c	(revision 163259)
+++ testsuite/gcc.dg/vect/bb-slp-2.c	(working copy)
@@ -24,8 +24,8 @@  main1 (int dummy)
       *pout++ = *pin++;

       /* Avoid loop vectorization.  */
-      if (dummy == 32)
-        abort ();
+      if (dummy)
+        __asm__ volatile ("" : : : "memory");
     }

   /* check results:  */