diff mbox

Fix SLP vectorization of shifts (PR tree-optimization/48616)

Message ID 20110416061623.GU17079@tyan-ft48-01.lab.bos.redhat.com
State New
Headers show

Commit Message

Jakub Jelinek April 16, 2011, 6:16 a.m. UTC
Hi!

As the attached testcase shows, while the current detection of what
shifts are by scalar and what shifts are by vector shift count
may work well for loop vectorizer (vect_internal_def being vector
shift, vect_external_def or vect_constant_def scalar shift),
it is incorrect for SLP, where vect_external_def and vect_constant_def
may be either a scalar or a vector shift (vect_external_def occurs e.g.
when the def_stmt is in a different bb; the shift count can be the same
SSA_NAME in all shifts or a different one, and for vect_constant_def it
can be the same constant in all cases or a different one)
and in theory vect_internal_def could also be a scalar shift (I tried
to test that in fn3's first bb, though currently SLP doesn't attempt
to vectorize that — a missed optimization).

The following patch fixes that.  Bootstrapped/regtested on x86_64-linux
and i686-linux and additionally tested on the testcase with additional
-mxop.  Ok for trunk/4.6?

For better test coverage, perhaps the testcase should be duplicated
once the effective targets xop and xop_runtime are added, and
additionally a testcase in gcc.dg/vect with -mxop, checking from the
dumps what is SLP vectorized, would be nice.

2011-04-16  Jakub Jelinek  <jakub@redhat.com>

	PR tree-optimization/48616
	* tree-vect-stmts.c (vectorizable_shift): If SLP, determine
	whether the shift is by scalar or vector based on whether all SLP
	scalar stmts have the same rhs.

	* gcc.dg/pr48616.c: New test.


	Jakub

Comments

Ira Rosen April 17, 2011, 8:30 a.m. UTC | #1
Jakub Jelinek <jakub@redhat.com> wrote on 16/04/2011 09:16:23 AM:
>
> Hi!
>
> As the attached testcase shows, while the current detection of what
> shifts are by scalar and what shifts are by vector shift count
> may work well for loop vectorizer (vect_internal_def being vector
> shift, vect_external_def or vect_constant_def scalar shift),
> it is incorrect for SLP, where vect_external_def and vect_constant_def
> may be either scalar or vector shift (vect_external_def is e.g.
> if def_stmt is in a different bb and either it can be the same
> SSA_NAMEs in all shifts, or different, for vect_constant_def it
> can be either the same constant in all cases, or different)
> and in theory vect_internal_def could be also scalar shift (tried
> to test that in fn3's first bb, though currently SLP doesn't attempt
> to vectorize that, missed-optimization).
>
> The following patch fixes that.  Bootstrapped/regtested on x86_64-linux
> and i686-linux and additionally tested on the testcase with additional
> -mxop.  Ok for trunk/4.6?
>
> For better test coverage, perhaps the testcase should be duplicated
> after effective target xop and xop_runtime are added and additionally
> perhaps the testcase in gcc.dg/vect with -mxop testing what is SLP
> vectorized from the dumps would be nice.
>
> 2011-04-16  Jakub Jelinek  <jakub@redhat.com>
>
>    PR tree-optimization/48616
>    * tree-vect-stmts.c (vectorizable_shift): If SLP, determine
>    whether the shift is by scalar or vector based on whether all SLP
>    scalar stmts have the same rhs.
>
>    * gcc.dg/pr48616.c: New test.
>
> --- gcc/tree-vect-stmts.c.jj   2011-03-14 14:12:15.000000000 +0100
> +++ gcc/tree-vect-stmts.c   2011-04-15 20:44:30.000000000 +0200
> @@ -2077,7 +2077,7 @@ vectorizable_shift (gimple stmt, gimple_
>    VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
>    tree vop0, vop1;
>    unsigned int k;
> -  bool scalar_shift_arg = false;
> +  bool scalar_shift_arg = true;
>    bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
>    int vf;
>
> @@ -2159,8 +2159,34 @@ vectorizable_shift (gimple stmt, gimple_
>    /* Determine whether the shift amount is a vector, or scalar.  If the
>       shift/rotate amount is a vector, use the vector/vector shift
optabs.  */
>
> +  if (dt[1] == vect_internal_def && !slp_node)
> +    scalar_shift_arg = false;
> +  else if (dt[1] == vect_constant_def
> +      || dt[1] == vect_external_def
> +      || dt[1] == vect_internal_def)
> +    {
> +      /* In SLP, need to check whether the shift count is the same,
> +    in loops if it is a constant or invariant, it is always
> +    a scalar shift.  */
> +      if (slp_node)
> +   {
> +     VEC (gimple, heap) *stmts = SLP_TREE_SCALAR_STMTS (slp_node);
> +     gimple slpstmt;
> +
> +     FOR_EACH_VEC_ELT (gimple, stmts, k, slpstmt)
> +       if (!operand_equal_p (gimple_assign_rhs2 (slpstmt), op1, 0))
> +         scalar_shift_arg = false;

We already have this check in vect_build_slp_tree(). It didn't work for the
testcase because it doesn't take into account the type of definition. I
agree that it's better to move it here and base the vector/scalar decision
on it, but please remove the now redundant check from
vect_build_slp_tree().

Otherwise, OK for trunk.

Thanks,
Ira

> +   }
> +    }
> +  else
> +    {
> +      if (vect_print_dump_info (REPORT_DETAILS))
> +   fprintf (vect_dump, "operand mode requires invariant argument.");
> +      return false;
> +    }
> +
>    /* Vector shifted by vector.  */
> -  if (dt[1] == vect_internal_def)
> +  if (!scalar_shift_arg)
>      {
>        optab = optab_for_tree_code (code, vectype, optab_vector);
>        if (vect_print_dump_info (REPORT_DETAILS))
> @@ -2168,13 +2194,12 @@ vectorizable_shift (gimple stmt, gimple_
>      }
>    /* See if the machine has a vector shifted by scalar insn and if not
>       then see if it has a vector shifted by vector insn.  */
> -  else if (dt[1] == vect_constant_def || dt[1] == vect_external_def)
> +  else
>      {
>        optab = optab_for_tree_code (code, vectype, optab_scalar);
>        if (optab
>            && optab_handler (optab, TYPE_MODE (vectype)) !=
CODE_FOR_nothing)
>          {
> -          scalar_shift_arg = true;
>            if (vect_print_dump_info (REPORT_DETAILS))
>              fprintf (vect_dump, "vector/scalar shift/rotate found.");
>          }
> @@ -2185,6 +2210,8 @@ vectorizable_shift (gimple stmt, gimple_
>                 && (optab_handler (optab, TYPE_MODE (vectype))
>                        != CODE_FOR_nothing))
>              {
> +         scalar_shift_arg = false;
> +
>                if (vect_print_dump_info (REPORT_DETAILS))
>                  fprintf (vect_dump, "vector/vector shift/rotate
found.");
>
> @@ -2197,12 +2224,6 @@ vectorizable_shift (gimple stmt, gimple_
>              }
>          }
>      }
> -  else
> -    {
> -      if (vect_print_dump_info (REPORT_DETAILS))
> -        fprintf (vect_dump, "operand mode requires invariant
argument.");
> -      return false;
> -    }
>
>    /* Supportable by target?  */
>    if (!optab)
> --- gcc/testsuite/gcc.dg/pr48616.c.jj   2011-04-15 21:02:27.000000000
+0200
> +++ gcc/testsuite/gcc.dg/pr48616.c   2011-04-15 21:00:24.000000000 +0200
> @@ -0,0 +1,134 @@
> +/* PR tree-optimization/48616 */
> +/* { dg-do run } */
> +/* { dg-options "-O2 -ftree-vectorize" } */
> +
> +extern void abort (void);
> +int a[4] __attribute__((aligned (32)));
> +int b[4] __attribute__((aligned (32)));
> +int c[4] __attribute__((aligned (32)));
> +int d[4] __attribute__((aligned (32)));
> +int e[4] __attribute__((aligned (32)));
> +
> +__attribute__((noinline, noclone))
> +int
> +foo (int x)
> +{
> +  asm ("" : "+r" (x));
> +  return x;
> +}
> +
> +__attribute__((noinline, noclone))
> +void
> +fn1 (int i)
> +{
> +  a[0] = b[0] << c[0];
> +  a[1] = b[1] << c[1];
> +  a[2] = b[2] << c[2];
> +  a[3] = b[3] << c[3];
> +  if (i)
> +    {
> +      d[0] = e[0] >> c[0];
> +      d[1] = e[1] >> c[1];
> +      d[2] = e[2] >> c[2];
> +      d[3] = e[3] >> c[3];
> +    }
> +}
> +
> +__attribute__((noinline, noclone))
> +void
> +fn2 (int i)
> +{
> +  a[0] = b[0] << 1;
> +  a[1] = b[1] << 2;
> +  a[2] = b[2] << 3;
> +  a[3] = b[3] << 4;
> +  if (i)
> +    {
> +      d[0] = e[0] >> 1;
> +      d[1] = e[1] >> 2;
> +      d[2] = e[2] >> 3;
> +      d[3] = e[3] >> 4;
> +    }
> +}
> +
> +__attribute__((noinline, noclone))
> +void
> +fn3 (int i, int j)
> +{
> +  int x = foo (j);
> +  a[0] = b[0] << x;
> +  a[1] = b[1] << x;
> +  a[2] = b[2] << x;
> +  a[3] = b[3] << x;
> +  if (i)
> +    {
> +      d[0] = e[0] >> x;
> +      d[1] = e[1] >> x;
> +      d[2] = e[2] >> x;
> +      d[3] = e[3] >> x;
> +    }
> +}
> +
> +__attribute__((noinline, noclone))
> +void
> +fn4 (int i)
> +{
> +  a[0] = b[0] << 1;
> +  a[1] = b[1] << 1;
> +  a[2] = b[2] << 1;
> +  a[3] = b[3] << 1;
> +  if (i)
> +    {
> +      d[0] = e[0] >> 1;
> +      d[1] = e[1] >> 1;
> +      d[2] = e[2] >> 1;
> +      d[3] = e[3] >> 1;
> +    }
> +}
> +
> +int
> +main ()
> +{
> +  int i;
> +  int *t;
> +  for (i = 0; i < 4; i++)
> +    {
> +      b[i] = 32;
> +      c[i] = i + 1;
> +      e[i] = 32;
> +    }
> +  asm volatile ("" : : "r" (b) : "memory");
> +  asm volatile ("" : : "r" (c) : "memory");
> +  asm volatile ("" : "=r" (t) : "0" (d) : "memory");
> +  fn1 (t != 0);
> +  for (i = 0; i < 4; i++)
> +    {
> +      if (a[i] != (32 << (i + 1)) || d[i] != (32 >> (i + 1)))
> +   abort ();
> +      a[i] = 0;
> +      d[i] = 0;
> +    }
> +  fn2 (t != 0);
> +  for (i = 0; i < 4; i++)
> +    {
> +      if (a[i] != (32 << (i + 1)) || d[i] != (32 >> (i + 1)))
> +   abort ();
> +      a[i] = 0;
> +      d[i] = 0;
> +    }
> +  fn3 (t != 0, t != 0);
> +  for (i = 0; i < 4; i++)
> +    {
> +      if (a[i] != (32 << 1) || d[i] != (32 >> 1))
> +   abort ();
> +      a[i] = 0;
> +      d[i] = 0;
> +    }
> +  fn4 (t != 0);
> +  for (i = 0; i < 4; i++)
> +    {
> +      if (a[i] != (32 << 1) || d[i] != (32 >> 1))
> +   abort ();
> +    }
> +  return 0;
> +}
>
>    Jakub
diff mbox

Patch

--- gcc/tree-vect-stmts.c.jj	2011-03-14 14:12:15.000000000 +0100
+++ gcc/tree-vect-stmts.c	2011-04-15 20:44:30.000000000 +0200
@@ -2077,7 +2077,7 @@  vectorizable_shift (gimple stmt, gimple_
   VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
   tree vop0, vop1;
   unsigned int k;
-  bool scalar_shift_arg = false;
+  bool scalar_shift_arg = true;
   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
   int vf;
 
@@ -2159,8 +2159,34 @@  vectorizable_shift (gimple stmt, gimple_
   /* Determine whether the shift amount is a vector, or scalar.  If the
      shift/rotate amount is a vector, use the vector/vector shift optabs.  */
 
+  if (dt[1] == vect_internal_def && !slp_node)
+    scalar_shift_arg = false;
+  else if (dt[1] == vect_constant_def
+	   || dt[1] == vect_external_def
+	   || dt[1] == vect_internal_def)
+    {
+      /* In SLP, need to check whether the shift count is the same,
+	 in loops if it is a constant or invariant, it is always
+	 a scalar shift.  */
+      if (slp_node)
+	{
+	  VEC (gimple, heap) *stmts = SLP_TREE_SCALAR_STMTS (slp_node);
+	  gimple slpstmt;
+
+	  FOR_EACH_VEC_ELT (gimple, stmts, k, slpstmt)
+	    if (!operand_equal_p (gimple_assign_rhs2 (slpstmt), op1, 0))
+	      scalar_shift_arg = false;
+	}
+    }
+  else
+    {
+      if (vect_print_dump_info (REPORT_DETAILS))
+	fprintf (vect_dump, "operand mode requires invariant argument.");
+      return false;
+    }
+
   /* Vector shifted by vector.  */
-  if (dt[1] == vect_internal_def)
+  if (!scalar_shift_arg)
     {
       optab = optab_for_tree_code (code, vectype, optab_vector);
       if (vect_print_dump_info (REPORT_DETAILS))
@@ -2168,13 +2194,12 @@  vectorizable_shift (gimple stmt, gimple_
     }
   /* See if the machine has a vector shifted by scalar insn and if not
      then see if it has a vector shifted by vector insn.  */
-  else if (dt[1] == vect_constant_def || dt[1] == vect_external_def)
+  else
     {
       optab = optab_for_tree_code (code, vectype, optab_scalar);
       if (optab
           && optab_handler (optab, TYPE_MODE (vectype)) != CODE_FOR_nothing)
         {
-          scalar_shift_arg = true;
           if (vect_print_dump_info (REPORT_DETAILS))
             fprintf (vect_dump, "vector/scalar shift/rotate found.");
         }
@@ -2185,6 +2210,8 @@  vectorizable_shift (gimple stmt, gimple_
                && (optab_handler (optab, TYPE_MODE (vectype))
                       != CODE_FOR_nothing))
             {
+	      scalar_shift_arg = false;
+
               if (vect_print_dump_info (REPORT_DETAILS))
                 fprintf (vect_dump, "vector/vector shift/rotate found.");
 
@@ -2197,12 +2224,6 @@  vectorizable_shift (gimple stmt, gimple_
             }
         }
     }
-  else
-    {
-      if (vect_print_dump_info (REPORT_DETAILS))
-        fprintf (vect_dump, "operand mode requires invariant argument.");
-      return false;
-    }
 
   /* Supportable by target?  */
   if (!optab)
--- gcc/testsuite/gcc.dg/pr48616.c.jj	2011-04-15 21:02:27.000000000 +0200
+++ gcc/testsuite/gcc.dg/pr48616.c	2011-04-15 21:00:24.000000000 +0200
@@ -0,0 +1,134 @@ 
+/* PR tree-optimization/48616 */
+/* { dg-do run } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+extern void abort (void);
+int a[4] __attribute__((aligned (32)));
+int b[4] __attribute__((aligned (32)));
+int c[4] __attribute__((aligned (32)));
+int d[4] __attribute__((aligned (32)));
+int e[4] __attribute__((aligned (32)));
+
+__attribute__((noinline, noclone))
+int
+foo (int x)
+{
+  asm ("" : "+r" (x));
+  return x;
+}
+
+__attribute__((noinline, noclone))
+void
+fn1 (int i)
+{
+  a[0] = b[0] << c[0];
+  a[1] = b[1] << c[1];
+  a[2] = b[2] << c[2];
+  a[3] = b[3] << c[3];
+  if (i)
+    {
+      d[0] = e[0] >> c[0];
+      d[1] = e[1] >> c[1];
+      d[2] = e[2] >> c[2];
+      d[3] = e[3] >> c[3];
+    }
+}
+
+__attribute__((noinline, noclone))
+void
+fn2 (int i)
+{
+  a[0] = b[0] << 1;
+  a[1] = b[1] << 2;
+  a[2] = b[2] << 3;
+  a[3] = b[3] << 4;
+  if (i)
+    {
+      d[0] = e[0] >> 1;
+      d[1] = e[1] >> 2;
+      d[2] = e[2] >> 3;
+      d[3] = e[3] >> 4;
+    }
+}
+
+__attribute__((noinline, noclone))
+void
+fn3 (int i, int j)
+{
+  int x = foo (j);
+  a[0] = b[0] << x;
+  a[1] = b[1] << x;
+  a[2] = b[2] << x;
+  a[3] = b[3] << x;
+  if (i)
+    {
+      d[0] = e[0] >> x;
+      d[1] = e[1] >> x;
+      d[2] = e[2] >> x;
+      d[3] = e[3] >> x;
+    }
+}
+
+__attribute__((noinline, noclone))
+void
+fn4 (int i)
+{
+  a[0] = b[0] << 1;
+  a[1] = b[1] << 1;
+  a[2] = b[2] << 1;
+  a[3] = b[3] << 1;
+  if (i)
+    {
+      d[0] = e[0] >> 1;
+      d[1] = e[1] >> 1;
+      d[2] = e[2] >> 1;
+      d[3] = e[3] >> 1;
+    }
+}
+
+int
+main ()
+{
+  int i;
+  int *t;
+  for (i = 0; i < 4; i++)
+    {
+      b[i] = 32;
+      c[i] = i + 1;
+      e[i] = 32;
+    }
+  asm volatile ("" : : "r" (b) : "memory");
+  asm volatile ("" : : "r" (c) : "memory");
+  asm volatile ("" : "=r" (t) : "0" (d) : "memory");
+  fn1 (t != 0);
+  for (i = 0; i < 4; i++)
+    {
+      if (a[i] != (32 << (i + 1)) || d[i] != (32 >> (i + 1)))
+	abort ();
+      a[i] = 0;
+      d[i] = 0;
+    }
+  fn2 (t != 0);
+  for (i = 0; i < 4; i++)
+    {
+      if (a[i] != (32 << (i + 1)) || d[i] != (32 >> (i + 1)))
+	abort ();
+      a[i] = 0;
+      d[i] = 0;
+    }
+  fn3 (t != 0, t != 0);
+  for (i = 0; i < 4; i++)
+    {
+      if (a[i] != (32 << 1) || d[i] != (32 >> 1))
+	abort ();
+      a[i] = 0;
+      d[i] = 0;
+    }
+  fn4 (t != 0);
+  for (i = 0; i < 4; i++)
+    {
+      if (a[i] != (32 << 1) || d[i] != (32 >> 1))
+	abort ();
+    }
+  return 0;
+}