Patchwork Fix PR tree-optimization/45752

login
register
mail settings
Submitter Ira Rosen
Date Oct. 5, 2010, 12:02 p.m.
Message ID <OF68484037.2A6E3403-ONC22577B3.003FF33D-C22577B3.004224C6@il.ibm.com>
Download mbox | patch
Permalink /patch/66821/
State New
Headers show

Comments

Ira Rosen - Oct. 5, 2010, 12:02 p.m.
Hi,

In function vect_get_mask_element, several variables were declared as
static, which is problematic when there is more than one SLP instance in
a loop.

Both 4.5 and 4.6 patches were bootstrapped and tested on x86_64-suse-linux.

Applied to trunk. OK for 4.5?

Thanks,
Ira


4.5/4.6
ChangeLog:

	PR tree-optimization/45752
	* tree-vect-slp.c (vect_get_mask_element): Remove static
	variables, make them function arguments.
	(vect_transform_slp_perm_load): Pass new arguments to
	vect_get_mask_element.

testsuite/ChangeLog:

	PR tree-optimization/45752
	* gcc.dg/vect/pr45752.c: New test.


4.6 patch:

in
      the next vector as well.  */
@@ -1745,7 +1743,7 @@ vect_get_mask_element (gimple stmt, int
   /* The mask requires the next vector.  */
   if (*current_mask_element >= mask_nunits * 2)
     {
-      if (needs_first_vector || mask_fixed)
+      if (*needs_first_vector || *mask_fixed)
         {
           /* We either need the first vector too or have already moved to
the
              next vector. In both cases, this permutation needs three
@@ -1763,23 +1761,23 @@ vect_get_mask_element (gimple stmt, int
       /* We move to the next vector, dropping the first one and working
with
          the second and the third - we need to adjust the values of the
mask
          accordingly.  */
-      *current_mask_element -= mask_nunits * number_of_mask_fixes;
+      *current_mask_element -= mask_nunits * *number_of_mask_fixes;

       for (i = 0; i < index; i++)
-        mask[i] -= mask_nunits * number_of_mask_fixes;
+        mask[i] -= mask_nunits * *number_of_mask_fixes;

-      (number_of_mask_fixes)++;
-      mask_fixed = true;
+      (*number_of_mask_fixes)++;
+      *mask_fixed = true;
     }

-  *need_next_vector = mask_fixed;
+  *need_next_vector = *mask_fixed;

   /* This was the last element of this mask. Start a new one.  */
   if (index == mask_nunits - 1)
     {
-      number_of_mask_fixes = 1;
-      mask_fixed = false;
-      needs_first_vector = false;
+      *number_of_mask_fixes = 1;
+      *mask_fixed = false;
+      *needs_first_vector = false;
     }

   return true;
@@ -1805,6 +1803,9 @@ vect_transform_slp_perm_load (gimple stm
   int index, unroll_factor, *mask, current_mask_element, ncopies;
   bool only_one_vec = false, need_next_vector = false;
   int first_vec_index, second_vec_index, orig_vec_stmts_num,
vect_stmts_counter;
+  int number_of_mask_fixes = 1;
+  bool mask_fixed = false;
+  bool needs_first_vector = false;

   if (!targetm.vectorize.builtin_vec_perm)
     {
@@ -1891,7 +1892,9 @@ vect_transform_slp_perm_load (gimple stm
                 {
                   if (!vect_get_mask_element (stmt, first_mask_element, m,
                                    mask_nunits, only_one_vec, index, mask,
-                                   &current_mask_element,
&need_next_vector))
+                                   &current_mask_element,
&need_next_vector,
+                                   &number_of_mask_fixes, &mask_fixed,
+                                   &needs_first_vector))
                     return false;

                   mask[index++] = current_mask_element;
Richard Guenther - Oct. 5, 2010, 12:12 p.m.
On Tue, Oct 5, 2010 at 2:02 PM, Ira Rosen <IRAR@il.ibm.com> wrote:
>
> Hi,
>
> In function vect_get_mask_element several variables were declared as
> static, which is problematic when there are more than one SLP instances in
> a loop.
>
> Both 4.5 and 4.6 patches were bootstrapped and tested on x86_64-suse-linux.
>
> Applied to trunk. OK for 4.5?

Ok.

Thanks,
Richard.

> Thanks,
> Ira
>
>
> 4.5/4.6
> ChangeLog:
>
>        PR tree-optimization/45752
>        * tree-vect-slp.c (vect_get_mask_element): Remove static
>        variables, make them function arguments.
>        (vect_transform_slp_perm_load): Pass new arguments to
>        vect_get_mask_element.
>
> testsuite/ChangeLog:
>
>        PR tree-optimization/45752
>        * gcc.dg/vect/pr45752.c: New test.
>
>
> 4.6 patch:
>
> Index: testsuite/gcc.dg/vect/pr45752.c
> ===================================================================
> --- testsuite/gcc.dg/vect/pr45752.c     (revision 0)
> +++ testsuite/gcc.dg/vect/pr45752.c     (revision 0)
> @@ -0,0 +1,109 @@
> +/* { dg-require-effective-target vect_int } */
> +
> +#include <stdarg.h>
> +#include <stdio.h>
> +#include "tree-vect.h"
> +
> +#define M00 100
> +#define M10 216
> +#define M20 23
> +#define M30 237
> +#define M40 437
> +
> +#define M01 1322
> +#define M11 13
> +#define M21 27271
> +#define M31 2280
> +#define M41 284
> +
> +#define M02 74
> +#define M12 191
> +#define M22 500
> +#define M32 111
> +#define M42 1114
> +
> +#define M03 134
> +#define M13 117
> +#define M23 11
> +#define M33 771
> +#define M43 71
> +
> +#define M04 334
> +#define M14 147
> +#define M24 115
> +#define M34 7716
> +#define M44 16
> +
> +#define N 16
> +
> +void foo (unsigned int *__restrict__ pInput,
> +          unsigned int *__restrict__ pOutput,
> +          unsigned int *__restrict__ pInput2,
> +          unsigned int *__restrict__ pOutput2)
> +{
> +  unsigned int i, a, b, c, d, e;
> +
> +  for (i = 0; i < N / 5; i++)
> +    {
> +       a = *pInput++;
> +       b = *pInput++;
> +       c = *pInput++;
> +       d = *pInput++;
> +       e = *pInput++;
> +
> +       *pOutput++ = M00 * a + M01 * b + M02 * c + M03 * d + M04 * e;
> +       *pOutput++ = M10 * a + M11 * b + M12 * c + M13 * d + M14 * e;
> +       *pOutput++ = M20 * a + M21 * b + M22 * c + M23 * d + M24 * e;
> +       *pOutput++ = M30 * a + M31 * b + M32 * c + M33 * d + M34 * e;
> +       *pOutput++ = M40 * a + M41 * b + M42 * c + M43 * d + M44 * e;
> +
> +
> +       a = *pInput2++;
> +       b = *pInput2++;
> +       c = *pInput2++;
> +       d = *pInput2++;
> +       e = *pInput2++;
> +
> +       *pOutput2++ = M00 * a + M01 * b + M02 * c + M03 * d + M04 * e;
> +       *pOutput2++ = M10 * a + M11 * b + M12 * c + M13 * d + M14 * e;
> +       *pOutput2++ = M20 * a + M21 * b + M22 * c + M23 * d + M24 * e;
> +       *pOutput2++ = M30 * a + M31 * b + M32 * c + M33 * d + M34 * e;
> +       *pOutput2++ = M40 * a + M41 * b + M42 * c + M43 * d + M44 * e;
> +
> +    }
> +}
> +
> +int main (int argc, const char* argv[])
> +{
> +  unsigned int input[N], output[N], i, input2[N], output2[N];
> +  unsigned int check_results[N] = {3208, 1334, 28764, 35679, 2789, 13028,
> +    4754, 168364, 91254, 12399, 22848, 8174, 307964, 146829, 22009, 0};
> +  unsigned int check_results2[N] = {7136, 2702, 84604, 57909, 6633, 16956,
> +    6122, 224204, 113484, 16243, 26776, 9542, 363804, 169059, 25853, 0};
> +
> +  check_vect ();
> +
> +  for (i = 0; i < N; i++)
> +    {
> +      input[i] = i%256;
> +      input2[i] = i + 2;
> +      output[i] = 0;
> +      output2[i] = 0;
> +      __asm__ volatile ("");
> +    }
> +
> +  foo (input, output, input2, output2);
> +
> +  for (i = 0; i < N; i++)
> +    if (output[i] != check_results[i]
> +        || output2[i] != check_results2[i])
> +      abort ();
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 0 "vect"  } } */
> +/* { dg-final { scan-tree-dump-times "permutation requires at least three
> vectors" 2 "vect" { target vect_perm } } } */
> +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0
> "vect"  } } */
> +/* { dg-final { cleanup-tree-dump "vect" } } */
> +
> Index: tree-vect-slp.c
> ===================================================================
> --- tree-vect-slp.c     (revision 164986)
> +++ tree-vect-slp.c     (working copy)
> @@ -2177,20 +2177,18 @@ static bool
>  vect_get_mask_element (gimple stmt, int first_mask_element, int m,
>                        int mask_nunits, bool only_one_vec, int index,
>                        int *mask, int *current_mask_element,
> -                       bool *need_next_vector)
> +                       bool *need_next_vector, int *number_of_mask_fixes,
> +                       bool *mask_fixed, bool *needs_first_vector)
>  {
>   int i;
> -  static int number_of_mask_fixes = 1;
> -  static bool mask_fixed = false;
> -  static bool needs_first_vector = false;
>
>   /* Convert to target specific representation.  */
>   *current_mask_element = first_mask_element + m;
>   /* Adjust the value in case it's a mask for second and third vectors.
> */
> -  *current_mask_element -= mask_nunits * (number_of_mask_fixes - 1);
> +  *current_mask_element -= mask_nunits * (*number_of_mask_fixes - 1);
>
>   if (*current_mask_element < mask_nunits)
> -    needs_first_vector = true;
> +    *needs_first_vector = true;
>
>   /* We have only one input vector to permute but the mask accesses values
> in
>      the next vector as well.  */
> @@ -2208,7 +2206,7 @@ vect_get_mask_element (gimple stmt, int
>   /* The mask requires the next vector.  */
>   if (*current_mask_element >= mask_nunits * 2)
>     {
> -      if (needs_first_vector || mask_fixed)
> +      if (*needs_first_vector || *mask_fixed)
>         {
>           /* We either need the first vector too or have already moved to
> the
>              next vector. In both cases, this permutation needs three
> @@ -2226,23 +2224,23 @@ vect_get_mask_element (gimple stmt, int
>       /* We move to the next vector, dropping the first one and working
> with
>          the second and the third - we need to adjust the values of the
> mask
>          accordingly.  */
> -      *current_mask_element -= mask_nunits * number_of_mask_fixes;
> +      *current_mask_element -= mask_nunits * *number_of_mask_fixes;
>
>       for (i = 0; i < index; i++)
> -        mask[i] -= mask_nunits * number_of_mask_fixes;
> +        mask[i] -= mask_nunits * *number_of_mask_fixes;
>
> -      (number_of_mask_fixes)++;
> -      mask_fixed = true;
> +      (*number_of_mask_fixes)++;
> +      *mask_fixed = true;
>     }
>
> -  *need_next_vector = mask_fixed;
> +  *need_next_vector = *mask_fixed;
>
>   /* This was the last element of this mask. Start a new one.  */
>   if (index == mask_nunits - 1)
>     {
> -      number_of_mask_fixes = 1;
> -      mask_fixed = false;
> -      needs_first_vector = false;
> +      *number_of_mask_fixes = 1;
> +      *mask_fixed = false;
> +      *needs_first_vector = false;
>     }
>
>   return true;
> @@ -2268,6 +2266,9 @@ vect_transform_slp_perm_load (gimple stm
>   int index, unroll_factor, *mask, current_mask_element, ncopies;
>   bool only_one_vec = false, need_next_vector = false;
>   int first_vec_index, second_vec_index, orig_vec_stmts_num,
> vect_stmts_counter;
> +  int number_of_mask_fixes = 1;
> +  bool mask_fixed = false;
> +  bool needs_first_vector = false;
>
>   if (!targetm.vectorize.builtin_vec_perm)
>     {
> @@ -2351,7 +2352,9 @@ vect_transform_slp_perm_load (gimple stm
>                 {
>                   if (!vect_get_mask_element (stmt, first_mask_element, m,
>                                    mask_nunits, only_one_vec, index, mask,
> -                                   &current_mask_element,
> &need_next_vector))
> +                                   &current_mask_element,
> &need_next_vector,
> +                                   &number_of_mask_fixes, &mask_fixed,
> +                                   &needs_first_vector))
>                     return false;
>
>                   mask[index++] = current_mask_element;
>
> 4.5 patch:
>
> Index: testsuite/gcc.dg/vect/pr45752.c
> ===================================================================
> --- testsuite/gcc.dg/vect/pr45752.c     (revision 0)
> +++ testsuite/gcc.dg/vect/pr45752.c     (revision 0)
> @@ -0,0 +1,109 @@
> +/* { dg-require-effective-target vect_int } */
> +
> +#include <stdarg.h>
> +#include <stdio.h>
> +#include "tree-vect.h"
> +
> +#define M00 100
> +#define M10 216
> +#define M20 23
> +#define M30 237
> +#define M40 437
> +
> +#define M01 1322
> +#define M11 13
> +#define M21 27271
> +#define M31 2280
> +#define M41 284
> +
> +#define M02 74
> +#define M12 191
> +#define M22 500
> +#define M32 111
> +#define M42 1114
> +
> +#define M03 134
> +#define M13 117
> +#define M23 11
> +#define M33 771
> +#define M43 71
> +
> +#define M04 334
> +#define M14 147
> +#define M24 115
> +#define M34 7716
> +#define M44 16
> +
> +#define N 16
> +
> +void foo (unsigned int *__restrict__ pInput,
> +          unsigned int *__restrict__ pOutput,
> +          unsigned int *__restrict__ pInput2,
> +          unsigned int *__restrict__ pOutput2)
> +{
> +  unsigned int i, a, b, c, d, e;
> +
> +  for (i = 0; i < N / 5; i++)
> +    {
> +       a = *pInput++;
> +       b = *pInput++;
> +       c = *pInput++;
> +       d = *pInput++;
> +       e = *pInput++;
> +
> +       *pOutput++ = M00 * a + M01 * b + M02 * c + M03 * d + M04 * e;
> +       *pOutput++ = M10 * a + M11 * b + M12 * c + M13 * d + M14 * e;
> +       *pOutput++ = M20 * a + M21 * b + M22 * c + M23 * d + M24 * e;
> +       *pOutput++ = M30 * a + M31 * b + M32 * c + M33 * d + M34 * e;
> +       *pOutput++ = M40 * a + M41 * b + M42 * c + M43 * d + M44 * e;
> +
> +
> +       a = *pInput2++;
> +       b = *pInput2++;
> +       c = *pInput2++;
> +       d = *pInput2++;
> +       e = *pInput2++;
> +
> +       *pOutput2++ = M00 * a + M01 * b + M02 * c + M03 * d + M04 * e;
> +       *pOutput2++ = M10 * a + M11 * b + M12 * c + M13 * d + M14 * e;
> +       *pOutput2++ = M20 * a + M21 * b + M22 * c + M23 * d + M24 * e;
> +       *pOutput2++ = M30 * a + M31 * b + M32 * c + M33 * d + M34 * e;
> +       *pOutput2++ = M40 * a + M41 * b + M42 * c + M43 * d + M44 * e;
> +
> +    }
> +}
> +
> +int main (int argc, const char* argv[])
> +{
> +  unsigned int input[N], output[N], i, input2[N], output2[N];
> +  unsigned int check_results[N] = {3208, 1334, 28764, 35679, 2789, 13028,
> +    4754, 168364, 91254, 12399, 22848, 8174, 307964, 146829, 22009, 0};
> +  unsigned int check_results2[N] = {7136, 2702, 84604, 57909, 6633, 16956,
> +    6122, 224204, 113484, 16243, 26776, 9542, 363804, 169059, 25853, 0};
> +
> +  check_vect ();
> +
> +  for (i = 0; i < N; i++)
> +    {
> +      input[i] = i%256;
> +      input2[i] = i + 2;
> +      output[i] = 0;
> +      output2[i] = 0;
> +      __asm__ volatile ("");
> +    }
> +
> +  foo (input, output, input2, output2);
> +
> +  for (i = 0; i < N; i++)
> +    if (output[i] != check_results[i]
> +        || output2[i] != check_results2[i])
> +      abort ();
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 0 "vect"  } } */
> +/* { dg-final { scan-tree-dump-times "permutation requires at least three
> vectors" 2 "vect" { target vect_perm } } } */
> +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0
> "vect"  } } */
> +/* { dg-final { cleanup-tree-dump "vect" } } */
> +
> Index: tree-vect-slp.c
> ===================================================================
> --- tree-vect-slp.c     (revision 164986)
> +++ tree-vect-slp.c     (working copy)
> @@ -1714,20 +1714,18 @@ static bool
>  vect_get_mask_element (gimple stmt, int first_mask_element, int m,
>                        int mask_nunits, bool only_one_vec, int index,
>                        int *mask, int *current_mask_element,
> -                       bool *need_next_vector)
> +                       bool *need_next_vector, int *number_of_mask_fixes,
> +                       bool *mask_fixed, bool *needs_first_vector)
>  {
>   int i;
> -  static int number_of_mask_fixes = 1;
> -  static bool mask_fixed = false;
> -  static bool needs_first_vector = false;
>
>   /* Convert to target specific representation.  */
>   *current_mask_element = first_mask_element + m;
>   /* Adjust the value in case it's a mask for second and third vectors.
> */
> -  *current_mask_element -= mask_nunits * (number_of_mask_fixes - 1);
> +  *current_mask_element -= mask_nunits * (*number_of_mask_fixes - 1);
>
>   if (*current_mask_element < mask_nunits)
> -    needs_first_vector = true;
> +    *needs_first_vector = true;
>
>   /* We have only one input vector to permute but the mask accesses values
> in
>      the next vector as well.  */
> @@ -1745,7 +1743,7 @@ vect_get_mask_element (gimple stmt, int
>   /* The mask requires the next vector.  */
>   if (*current_mask_element >= mask_nunits * 2)
>     {
> -      if (needs_first_vector || mask_fixed)
> +      if (*needs_first_vector || *mask_fixed)
>         {
>           /* We either need the first vector too or have already moved to
> the
>              next vector. In both cases, this permutation needs three
> @@ -1763,23 +1761,23 @@ vect_get_mask_element (gimple stmt, int
>       /* We move to the next vector, dropping the first one and working
> with
>          the second and the third - we need to adjust the values of the
> mask
>          accordingly.  */
> -      *current_mask_element -= mask_nunits * number_of_mask_fixes;
> +      *current_mask_element -= mask_nunits * *number_of_mask_fixes;
>
>       for (i = 0; i < index; i++)
> -        mask[i] -= mask_nunits * number_of_mask_fixes;
> +        mask[i] -= mask_nunits * *number_of_mask_fixes;
>
> -      (number_of_mask_fixes)++;
> -      mask_fixed = true;
> +      (*number_of_mask_fixes)++;
> +      *mask_fixed = true;
>     }
>
> -  *need_next_vector = mask_fixed;
> +  *need_next_vector = *mask_fixed;
>
>   /* This was the last element of this mask. Start a new one.  */
>   if (index == mask_nunits - 1)
>     {
> -      number_of_mask_fixes = 1;
> -      mask_fixed = false;
> -      needs_first_vector = false;
> +      *number_of_mask_fixes = 1;
> +      *mask_fixed = false;
> +      *needs_first_vector = false;
>     }
>
>   return true;
> @@ -1805,6 +1803,9 @@ vect_transform_slp_perm_load (gimple stm
>   int index, unroll_factor, *mask, current_mask_element, ncopies;
>   bool only_one_vec = false, need_next_vector = false;
>   int first_vec_index, second_vec_index, orig_vec_stmts_num,
> vect_stmts_counter;
> +  int number_of_mask_fixes = 1;
> +  bool mask_fixed = false;
> +  bool needs_first_vector = false;
>
>   if (!targetm.vectorize.builtin_vec_perm)
>     {
> @@ -1891,7 +1892,9 @@ vect_transform_slp_perm_load (gimple stm
>                 {
>                   if (!vect_get_mask_element (stmt, first_mask_element, m,
>                                    mask_nunits, only_one_vec, index, mask,
> -                                   &current_mask_element,
> &need_next_vector))
> +                                   &current_mask_element,
> &need_next_vector,
> +                                   &number_of_mask_fixes, &mask_fixed,
> +                                   &needs_first_vector))
>                     return false;
>
>                   mask[index++] = current_mask_element;
>
>

Patch

Index: testsuite/gcc.dg/vect/pr45752.c
===================================================================
--- testsuite/gcc.dg/vect/pr45752.c     (revision 0)
+++ testsuite/gcc.dg/vect/pr45752.c     (revision 0)
@@ -0,0 +1,109 @@ 
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define M00 100
+#define M10 216
+#define M20 23
+#define M30 237
+#define M40 437
+
+#define M01 1322
+#define M11 13
+#define M21 27271
+#define M31 2280
+#define M41 284
+
+#define M02 74
+#define M12 191
+#define M22 500
+#define M32 111
+#define M42 1114
+
+#define M03 134
+#define M13 117
+#define M23 11
+#define M33 771
+#define M43 71
+
+#define M04 334
+#define M14 147
+#define M24 115
+#define M34 7716
+#define M44 16
+
+#define N 16
+
+void foo (unsigned int *__restrict__ pInput,
+          unsigned int *__restrict__ pOutput,
+          unsigned int *__restrict__ pInput2,
+          unsigned int *__restrict__ pOutput2)
+{
+  unsigned int i, a, b, c, d, e;
+
+  for (i = 0; i < N / 5; i++)
+    {
+       a = *pInput++;
+       b = *pInput++;
+       c = *pInput++;
+       d = *pInput++;
+       e = *pInput++;
+
+       *pOutput++ = M00 * a + M01 * b + M02 * c + M03 * d + M04 * e;
+       *pOutput++ = M10 * a + M11 * b + M12 * c + M13 * d + M14 * e;
+       *pOutput++ = M20 * a + M21 * b + M22 * c + M23 * d + M24 * e;
+       *pOutput++ = M30 * a + M31 * b + M32 * c + M33 * d + M34 * e;
+       *pOutput++ = M40 * a + M41 * b + M42 * c + M43 * d + M44 * e;
+
+
+       a = *pInput2++;
+       b = *pInput2++;
+       c = *pInput2++;
+       d = *pInput2++;
+       e = *pInput2++;
+
+       *pOutput2++ = M00 * a + M01 * b + M02 * c + M03 * d + M04 * e;
+       *pOutput2++ = M10 * a + M11 * b + M12 * c + M13 * d + M14 * e;
+       *pOutput2++ = M20 * a + M21 * b + M22 * c + M23 * d + M24 * e;
+       *pOutput2++ = M30 * a + M31 * b + M32 * c + M33 * d + M34 * e;
+       *pOutput2++ = M40 * a + M41 * b + M42 * c + M43 * d + M44 * e;
+
+    }
+}
+
+int main (int argc, const char* argv[])
+{
+  unsigned int input[N], output[N], i, input2[N], output2[N];
+  unsigned int check_results[N] = {3208, 1334, 28764, 35679, 2789, 13028,
+    4754, 168364, 91254, 12399, 22848, 8174, 307964, 146829, 22009, 0};
+  unsigned int check_results2[N] = {7136, 2702, 84604, 57909, 6633, 16956,
+    6122, 224204, 113484, 16243, 26776, 9542, 363804, 169059, 25853, 0};
+
+  check_vect ();
+
+  for (i = 0; i < N; i++)
+    {
+      input[i] = i%256;
+      input2[i] = i + 2;
+      output[i] = 0;
+      output2[i] = 0;
+      __asm__ volatile ("");
+    }
+
+  foo (input, output, input2, output2);
+
+  for (i = 0; i < N; i++)
+    if (output[i] != check_results[i]
+        || output2[i] != check_results2[i])
+      abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 0 "vect"  } } */
+/* { dg-final { scan-tree-dump-times "permutation requires at least three vectors" 2 "vect" { target vect_perm } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect"  } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
Index: tree-vect-slp.c
===================================================================
--- tree-vect-slp.c     (revision 164986)
+++ tree-vect-slp.c     (working copy)
@@ -2177,20 +2177,18 @@  static bool
 vect_get_mask_element (gimple stmt, int first_mask_element, int m,
                        int mask_nunits, bool only_one_vec, int index,
                        int *mask, int *current_mask_element,
-                       bool *need_next_vector)
+                       bool *need_next_vector, int *number_of_mask_fixes,
+                       bool *mask_fixed, bool *needs_first_vector)
 {
   int i;
-  static int number_of_mask_fixes = 1;
-  static bool mask_fixed = false;
-  static bool needs_first_vector = false;

   /* Convert to target specific representation.  */
   *current_mask_element = first_mask_element + m;
   /* Adjust the value in case it's a mask for second and third vectors.  */
-  *current_mask_element -= mask_nunits * (number_of_mask_fixes - 1);
+  *current_mask_element -= mask_nunits * (*number_of_mask_fixes - 1);

   if (*current_mask_element < mask_nunits)
-    needs_first_vector = true;
+    *needs_first_vector = true;

   /* We have only one input vector to permute but the mask accesses values in
      the next vector as well.  */
@@ -2208,7 +2206,7 @@  vect_get_mask_element (gimple stmt, int
   /* The mask requires the next vector.  */
   if (*current_mask_element >= mask_nunits * 2)
     {
-      if (needs_first_vector || mask_fixed)
+      if (*needs_first_vector || *mask_fixed)
         {
           /* We either need the first vector too or have already moved to the
              next vector. In both cases, this permutation needs three
@@ -2226,23 +2224,23 @@  vect_get_mask_element (gimple stmt, int
       /* We move to the next vector, dropping the first one and working with
          the second and the third - we need to adjust the values of the mask
          accordingly.  */
-      *current_mask_element -= mask_nunits * number_of_mask_fixes;
+      *current_mask_element -= mask_nunits * *number_of_mask_fixes;

       for (i = 0; i < index; i++)
-        mask[i] -= mask_nunits * number_of_mask_fixes;
+        mask[i] -= mask_nunits * *number_of_mask_fixes;

-      (number_of_mask_fixes)++;
-      mask_fixed = true;
+      (*number_of_mask_fixes)++;
+      *mask_fixed = true;
     }

-  *need_next_vector = mask_fixed;
+  *need_next_vector = *mask_fixed;

   /* This was the last element of this mask. Start a new one.  */
   if (index == mask_nunits - 1)
     {
-      number_of_mask_fixes = 1;
-      mask_fixed = false;
-      needs_first_vector = false;
+      *number_of_mask_fixes = 1;
+      *mask_fixed = false;
+      *needs_first_vector = false;
     }

   return true;
@@ -2268,6 +2266,9 @@  vect_transform_slp_perm_load (gimple stm
   int index, unroll_factor, *mask, current_mask_element, ncopies;
   bool only_one_vec = false, need_next_vector = false;
   int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter;
+  int number_of_mask_fixes = 1;
+  bool mask_fixed = false;
+  bool needs_first_vector = false;

   if (!targetm.vectorize.builtin_vec_perm)
     {
@@ -2351,7 +2352,9 @@  vect_transform_slp_perm_load (gimple stm
                 {
                   if (!vect_get_mask_element (stmt, first_mask_element, m,
                                    mask_nunits, only_one_vec, index, mask,
-                                   &current_mask_element, &need_next_vector))
+                                   &current_mask_element, &need_next_vector,
+                                   &number_of_mask_fixes, &mask_fixed,
+                                   &needs_first_vector))
                     return false;

                   mask[index++] = current_mask_element;

4.5 patch:

Index: testsuite/gcc.dg/vect/pr45752.c
===================================================================
--- testsuite/gcc.dg/vect/pr45752.c     (revision 0)
+++ testsuite/gcc.dg/vect/pr45752.c     (revision 0)
@@ -0,0 +1,109 @@ 
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define M00 100
+#define M10 216
+#define M20 23
+#define M30 237
+#define M40 437
+
+#define M01 1322
+#define M11 13
+#define M21 27271
+#define M31 2280
+#define M41 284
+
+#define M02 74
+#define M12 191
+#define M22 500
+#define M32 111
+#define M42 1114
+
+#define M03 134
+#define M13 117
+#define M23 11
+#define M33 771
+#define M43 71
+
+#define M04 334
+#define M14 147
+#define M24 115
+#define M34 7716
+#define M44 16
+
+#define N 16
+
+void foo (unsigned int *__restrict__ pInput,
+          unsigned int *__restrict__ pOutput,
+          unsigned int *__restrict__ pInput2,
+          unsigned int *__restrict__ pOutput2)
+{
+  unsigned int i, a, b, c, d, e;
+
+  for (i = 0; i < N / 5; i++)
+    {
+       a = *pInput++;
+       b = *pInput++;
+       c = *pInput++;
+       d = *pInput++;
+       e = *pInput++;
+
+       *pOutput++ = M00 * a + M01 * b + M02 * c + M03 * d + M04 * e;
+       *pOutput++ = M10 * a + M11 * b + M12 * c + M13 * d + M14 * e;
+       *pOutput++ = M20 * a + M21 * b + M22 * c + M23 * d + M24 * e;
+       *pOutput++ = M30 * a + M31 * b + M32 * c + M33 * d + M34 * e;
+       *pOutput++ = M40 * a + M41 * b + M42 * c + M43 * d + M44 * e;
+
+
+       a = *pInput2++;
+       b = *pInput2++;
+       c = *pInput2++;
+       d = *pInput2++;
+       e = *pInput2++;
+
+       *pOutput2++ = M00 * a + M01 * b + M02 * c + M03 * d + M04 * e;
+       *pOutput2++ = M10 * a + M11 * b + M12 * c + M13 * d + M14 * e;
+       *pOutput2++ = M20 * a + M21 * b + M22 * c + M23 * d + M24 * e;
+       *pOutput2++ = M30 * a + M31 * b + M32 * c + M33 * d + M34 * e;
+       *pOutput2++ = M40 * a + M41 * b + M42 * c + M43 * d + M44 * e;
+
+    }
+}
+
+int main (int argc, const char* argv[])
+{
+  unsigned int input[N], output[N], i, input2[N], output2[N];
+  unsigned int check_results[N] = {3208, 1334, 28764, 35679, 2789, 13028,
+    4754, 168364, 91254, 12399, 22848, 8174, 307964, 146829, 22009, 0};
+  unsigned int check_results2[N] = {7136, 2702, 84604, 57909, 6633, 16956,
+    6122, 224204, 113484, 16243, 26776, 9542, 363804, 169059, 25853, 0};
+
+  check_vect ();
+
+  for (i = 0; i < N; i++)
+    {
+      input[i] = i%256;
+      input2[i] = i + 2;
+      output[i] = 0;
+      output2[i] = 0;
+      __asm__ volatile ("");
+    }
+
+  foo (input, output, input2, output2);
+
+  for (i = 0; i < N; i++)
+    if (output[i] != check_results[i]
+        || output2[i] != check_results2[i])
+      abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 0 "vect"  } } */
+/* { dg-final { scan-tree-dump-times "permutation requires at least three vectors" 2 "vect" { target vect_perm } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect"  } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
Index: tree-vect-slp.c
===================================================================
--- tree-vect-slp.c     (revision 164986)
+++ tree-vect-slp.c     (working copy)
@@ -1714,20 +1714,18 @@  static bool
 vect_get_mask_element (gimple stmt, int first_mask_element, int m,
                        int mask_nunits, bool only_one_vec, int index,
                        int *mask, int *current_mask_element,
-                       bool *need_next_vector)
+                       bool *need_next_vector, int *number_of_mask_fixes,
+                       bool *mask_fixed, bool *needs_first_vector)
 {
   int i;
-  static int number_of_mask_fixes = 1;
-  static bool mask_fixed = false;
-  static bool needs_first_vector = false;

   /* Convert to target specific representation.  */
   *current_mask_element = first_mask_element + m;
   /* Adjust the value in case it's a mask for second and third vectors.  */
-  *current_mask_element -= mask_nunits * (number_of_mask_fixes - 1);
+  *current_mask_element -= mask_nunits * (*number_of_mask_fixes - 1);

   if (*current_mask_element < mask_nunits)
-    needs_first_vector = true;
+    *needs_first_vector = true;

   /* We have only one input vector to permute but the mask accesses values