Patchwork Fix PR tree-optimization/49038

login
register
mail settings
Submitter Ira Rosen
Date May 26, 2011, 7:52 a.m.
Message ID <BANLkTimVHPK0gKUYsfKNdoknJ9PpfDJ9qQ@mail.gmail.com>
Download mbox | patch
Permalink /patch/97504/
State New
Headers show

Comments

Ira Rosen - May 26, 2011, 7:52 a.m.
Hi,

The vectorizer supports strided loads with gaps, e.g., when only a[4i]
and a[4i+2] are accessed, it generates a vector load a[4i:4i+3], i.e.,
creating an access to a[4i+3], which doesn't exist in the scalar code.
This access maybe invalid as described in the PR.

This patch creates an epilogue loop (with at least one iteration) for
such cases.

Bootstrapped and tested on powerpc64-suse-linux.
Applied to trunk. I'll prepare patches for 4.5 and 4.6 next week.

Ira


ChangeLog:

	PR tree-optimization/49038
	* tree-vect-loop-manip.c (vect_generate_tmps_on_preheader):
	Ensure at least one epilogue iteration if required by data
	accesses with gaps.
	* tree-vectorizer.h (struct _loop_vec_info): Add new field
	to mark loops that require peeling for gaps.
	* tree-vect-loop.c (new_loop_vec_info): Initialize new field.
	(vect_get_known_peeling_cost): Take peeling for gaps into
	account.
	(vect_transform_loop): Generate epilogue if required by data
	access with gaps.
	* tree-vect-data-refs.c (vect_analyze_group_access): Mark the
	loop as requiring an epilogue if there are gaps in the end of
	the strided group.

testsuite/ChangeLog:

	PR tree-optimization/49038
	* gcc.dg/vect/vect-strided-u8-i8-gap4-unknown.c: New test.
	* gcc.dg/vect/pr49038.c: New test.
H.J. Lu - May 31, 2011, 4:53 a.m.
On Thu, May 26, 2011 at 12:52 AM, Ira Rosen <ira.rosen@linaro.org> wrote:
> Hi,
>
> The vectorizer supports strided loads with gaps, e.g., when only a[4i]
> and a[4i+2] are accessed, it generates a vector load a[4i:4i+3], i.e.,
> creating an access to a[4i+3], which doesn't exist in the scalar code.
> This access maybe invalid as described in the PR.
>
> This patch creates an epilogue loop (with at least one iteration) for
> such cases.
>
> Bootstrapped and tested on powerpc64-suse-linux.
> Applied to trunk. I'll prepare patches for 4.5 and 4.6 next week.
>
> Ira
>
>
> ChangeLog:
>
>        PR tree-optimization/49038
>        * tree-vect-loop-manip.c (vect_generate_tmps_on_preheader):
>        Ensure at least one epilogue iteration if required by data
>        accesses with gaps.
>        * tree-vectorizer.h (struct _loop_vec_info): Add new field
>        to mark loops that require peeling for gaps.
>        * tree-vect-loop.c (new_loop_vec_info): Initialize new field.
>        (vect_get_known_peeling_cost): Take peeling for gaps into
>        account.
>        (vect_transform_loop): Generate epilogue if required by data
>        access with gaps.
>        * tree-vect-data-refs.c (vect_analyze_group_access): Mark the
>        loop as requiring an epilogue if there are gaps in the end of
>        the strided group.
>
> testsuite/ChangeLog:
>
>        PR tree-optimization/49038
>        * gcc.dg/vect/vect-strided-u8-i8-gap4-unknown.c: New test.

This test fails at run-time at random on Linux/ia32:

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=49239

Patch

Index: tree-vect-loop-manip.c
===================================================================
--- tree-vect-loop-manip.c	(revision 174264)
+++ tree-vect-loop-manip.c	(working copy)
@@ -1551,7 +1551,7 @@  vect_generate_tmps_on_preheader (loop_vec_info loo
   edge pe;
   basic_block new_bb;
   gimple_seq stmts;
-  tree ni_name;
+  tree ni_name, ni_minus_gap_name;
   tree var;
   tree ratio_name;
   tree ratio_mult_vf_name;
@@ -1568,9 +1568,39 @@  vect_generate_tmps_on_preheader (loop_vec_info loo
   ni_name = vect_build_loop_niters (loop_vinfo, cond_expr_stmt_list);
   log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf));
 
+  /* If epilogue loop is required because of data accesses with gaps, we
+     subtract one iteration from the total number of iterations here for
+     correct calculation of RATIO.  */
+  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
+    {
+      ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
+				       ni_name,
+			               build_one_cst (TREE_TYPE (ni_name)));
+      if (!is_gimple_val (ni_minus_gap_name))
+	{
+	  var = create_tmp_var (TREE_TYPE (ni), "ni_gap");
+          add_referenced_var (var);
+
+          stmts = NULL;
+          ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
+						    true, var);
+          if (cond_expr_stmt_list)
+            gimple_seq_add_seq (&cond_expr_stmt_list, stmts);
+          else
+            {
+              pe = loop_preheader_edge (loop);
+              new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
+              gcc_assert (!new_bb);
+            }
+        }
+    }
+  else
+    ni_minus_gap_name = ni_name;
+
   /* Create: ratio = ni >> log2(vf) */
 
-  ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name), ni_name, log_vf);
+  ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_minus_gap_name),
+			    ni_minus_gap_name, log_vf);
   if (!is_gimple_val (ratio_name))
     {
       var = create_tmp_var (TREE_TYPE (ni), "bnd");
Index: testsuite/gcc.dg/vect/vect-strided-u8-i8-gap4-unknown.c
===================================================================
--- testsuite/gcc.dg/vect/vect-strided-u8-i8-gap4-unknown.c	(revision 0)
+++ testsuite/gcc.dg/vect/vect-strided-u8-i8-gap4-unknown.c	(revision 0)
@@ -0,0 +1,103 @@ 
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define N 160 
+
+typedef struct {
+   unsigned char a;
+   unsigned char b;
+   unsigned char c;
+   unsigned char d;
+   unsigned char e;
+   unsigned char f;
+   unsigned char g;
+   unsigned char h;
+} s;
+
+__attribute__ ((noinline)) int
+main1 (s *arr, int n)
+{
+  int i;
+  s *ptr = arr;
+  s res[N];
+  unsigned char x;
+
+  /* Check peeling for gaps for unknown loop bound.  */
+  for (i = 0; i < n; i++)
+    {
+      res[i].c = ptr->b + ptr->c;
+      x = ptr->c + ptr->f;
+      res[i].a = x + ptr->b;
+      res[i].d = ptr->b + ptr->c;
+      res[i].b = ptr->c;
+      res[i].f = ptr->f + ptr->e;
+      res[i].e = ptr->b + ptr->e; 
+      res[i].h = ptr->c;   
+      res[i].g = ptr->b + ptr->c;
+      ptr++; 
+    } 
+   
+  /* check results:  */
+  for (i = 0; i < n; i++)
+    { 
+      if (res[i].c != arr[i].b + arr[i].c
+          || res[i].a != arr[i].c + arr[i].f + arr[i].b
+          || res[i].d != arr[i].b + arr[i].c
+          || res[i].b != arr[i].c
+          || res[i].f != arr[i].f + arr[i].e
+          || res[i].e != arr[i].b + arr[i].e
+          || res[i].h != arr[i].c
+          || res[i].g != arr[i].b + arr[i].c)
+        abort ();
+   }
+
+  /* Check also that we don't do more iterations than needed.  */
+  for (i = n; i < N; i++)
+    {
+      if (res[i].c == arr[i].b + arr[i].c
+          || res[i].a == arr[i].c + arr[i].f + arr[i].b
+          || res[i].d == arr[i].b + arr[i].c
+          || res[i].b == arr[i].c
+          || res[i].f == arr[i].f + arr[i].e
+          || res[i].e == arr[i].b + arr[i].e
+          || res[i].h == arr[i].c
+          || res[i].g == arr[i].b + arr[i].c)
+        abort ();
+   }
+
+  return 0;
+}
+
+
+int main (void)
+{
+  int i;
+  s arr[N];
+  
+  check_vect ();
+
+  for (i = 0; i < N; i++)
+    { 
+      arr[i].a = 5;
+      arr[i].b = 6;
+      arr[i].c = 17;
+      arr[i].d = 3;
+      arr[i].e = 16;
+      arr[i].f = 16;
+      arr[i].g = 3;
+      arr[i].h = 56;
+      if (arr[i].a == 178)
+         abort(); 
+    } 
+
+  main1 (arr, N-2);
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_strided8 } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+  
Index: testsuite/gcc.dg/vect/pr49038.c
===================================================================
--- testsuite/gcc.dg/vect/pr49038.c	(revision 0)
+++ testsuite/gcc.dg/vect/pr49038.c	(revision 0)
@@ -0,0 +1,38 @@ 
+#include <sys/mman.h>
+#include <stdio.h>
+
+#define COUNT 320
+#define MMAP_SIZE 0x10000
+#define ADDRESS 0x1122000000
+#define TYPE unsigned short
+
+void __attribute__((noinline))
+foo (TYPE *__restrict a, TYPE *__restrict b)
+{
+  int n;
+
+  for (n = 0; n < COUNT; n++)
+    a[n] = b[n * 2];
+}
+
+int
+main (void)
+{
+  void *x;
+  size_t b_offset;
+
+  x = mmap ((void *) ADDRESS, MMAP_SIZE, PROT_READ | PROT_WRITE,
+	    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+  if (x == MAP_FAILED)
+    {
+      perror ("mmap");
+      return 1;
+    }
+
+  b_offset = MMAP_SIZE - (2 * COUNT - 1) * sizeof (TYPE);
+  foo ((unsigned short *) x,
+       (unsigned short *) ((char *) x + b_offset));
+  return 0;
+}
+
+/* { dg-final { cleanup-tree-dump "vect" } } */
Index: tree-vectorizer.h
===================================================================
--- tree-vectorizer.h	(revision 174264)
+++ tree-vectorizer.h	(working copy)
@@ -255,6 +255,11 @@  typedef struct _loop_vec_info {
   /* Hash table used to choose the best peeling option.  */
   htab_t peeling_htab;
 
+  /* When we have strided data accesses with gaps, we may introduce invalid
+     memory accesses.  We peel the last iteration of the loop to prevent
+     this.  */
+  bool peeling_for_gaps;
+
 } *loop_vec_info;
 
 /* Access Functions.  */
@@ -283,6 +288,7 @@  typedef struct _loop_vec_info {
 #define LOOP_VINFO_REDUCTIONS(L)           (L)->reductions
 #define LOOP_VINFO_REDUCTION_CHAINS(L)     (L)->reduction_chains
 #define LOOP_VINFO_PEELING_HTAB(L)         (L)->peeling_htab
+#define LOOP_VINFO_PEELING_FOR_GAPS(L)     (L)->peeling_for_gaps
 
 #define LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT(L) \
 VEC_length (gimple, (L)->may_misalign_stmts) > 0
Index: tree-vect-loop.c
===================================================================
--- tree-vect-loop.c	(revision 174264)
+++ tree-vect-loop.c	(working copy)
@@ -761,6 +761,7 @@  new_loop_vec_info (struct loop *loop)
   LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
   LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
   LOOP_VINFO_PEELING_HTAB (res) = NULL;
+  LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
 
   return res;
 }
@@ -2333,6 +2334,10 @@  vect_get_known_peeling_cost (loop_vec_info loop_vi
       peel_iters_prologue = niters < peel_iters_prologue ?
                             niters : peel_iters_prologue;
       *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
+      /* If we need to peel for gaps, but no peeling is required, we have to
+	 peel VF iterations.  */
+      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
+        *peel_iters_epilogue = vf;
     }
 
    return (peel_iters_prologue * scalar_single_iter_cost)
@@ -4987,7 +4992,8 @@  vect_transform_loop (loop_vec_info loop_vinfo)
   do_peeling_for_loop_bound
     = (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
        || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-	   && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0));
+	   && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0)
+       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
 
   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
Index: tree-vect-data-refs.c
===================================================================
--- tree-vect-data-refs.c	(revision 174264)
+++ tree-vect-data-refs.c	(working copy)
@@ -2043,7 +2043,7 @@  vect_analyze_group_access (struct data_reference *
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
   HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
-  HOST_WIDE_INT stride;
+  HOST_WIDE_INT stride, last_accessed_element = 1;
   bool slp_impossible = false;
 
   /* For interleaving, STRIDE is STEP counted in elements, i.e., the size of the
@@ -2072,6 +2072,16 @@  vect_analyze_group_access (struct data_reference *
 	      fprintf (vect_dump, " step ");
 	      print_generic_expr (vect_dump, step, TDF_SLIM);
 	    }
+
+	  if (loop_vinfo)
+	    {
+	      LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
+
+	      if (vect_print_dump_info (REPORT_DETAILS))
+		fprintf (vect_dump, "Data access with gaps requires scalar "
+				    "epilogue loop");
+	    }
+
 	  return true;
 	}
 
@@ -2137,6 +2147,7 @@  vect_analyze_group_access (struct data_reference *
               next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
               continue;
             }
+
           prev = next;
 
           /* Check that all the accesses have the same STEP.  */
@@ -2167,6 +2178,8 @@  vect_analyze_group_access (struct data_reference *
               gaps += diff - 1;
 	    }
 
+	  last_accessed_element += diff;
+
           /* Store the gap from the previous member of the group. If there is no
              gap in the access, GROUP_GAP is always 1.  */
           GROUP_GAP (vinfo_for_stmt (next)) = diff;
@@ -2245,6 +2258,15 @@  vect_analyze_group_access (struct data_reference *
             VEC_safe_push (gimple, heap, BB_VINFO_STRIDED_STORES (bb_vinfo),
                            stmt);
         }
+
+      /* There is a gap in the end of the group.  */
+      if (stride - last_accessed_element > 0 && loop_vinfo)
+	{
+	  LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
+	  if (vect_print_dump_info (REPORT_DETAILS))
+	    fprintf (vect_dump, "Data access with gaps requires scalar "
+				"epilogue loop");
+	}
     }
 
   return true;