Enhancing the widen-mult pattern in vectorization.

Hi

The current widen-mult pattern only considers two operands with the
same size. However, operands with different sizes can also benefit
from this pattern. The following loop shows such an example:


char a[N];
short b[N];
int c[N];

for (int i = 0; i < N; ++i)
  c[i] = a[i] * b[i];


In this case, we can convert a[i] into short type then perform
widen-mult on b[i] and the converted value:


for (int i = 0; i < N; ++i) {
  short t = a[i];
  c[i] = t w* b[i];
}


This patch adds such support. In addition, the following loop fails to
be recognized as a widen-mult pattern because the widening operation
from char to int is not directly supported by the target:


char a[N], b[N];
int c[N];

for (int i = 0; i < N; ++i)
  c[i] = a[i] * b[i];


In this case, we can still perform widen-mult on a[i] and b[i], and
get a result of short type, then convert it to int:


char a[N], b[N];
int c[N];

for (int i = 0; i < N; ++i) {
  short t = a[i] w* b[i];
  c[i] = (int) t;
}


Currently GCC does not allow multi-step conversions for binary
widening operations. This pattern removes this restriction and use
VEC_UNPACK_LO_EXPR/VEC_UNPACK_HI_EXPR to arrange data after the
widen-mult is performed for the widen-mult pattern. This can reduce
several unpacking instructions (for this example, the number of
packings/unpackings is reduced from 12 to 8. For SSE2, the inefficient
multiplication between two V4SI vectors can also be avoided).

Bootstrapped and tested on an x86_64 machine.



thanks,
Cong



   || (GET_MODE_SIZE (TYPE_MODE (lhs_type))
@@ -2787,6 +2782,15 @@ vectorizable_conversion (gimple stmt,
gimple_stmt_iterator *gsi,
   c1 = codecvt1;
   c2 = codecvt2;
  }
+      else if (op_type == binary_op && i < multi_step_cvt)
+ {
+  /* For binary widening operations, if multi-steps are needed,
+     use VEC_UNPACK_LO_EXPR/VEC_UNPACK_HI_EXPR to convert
+     intermediate result to final one.  */
+  c1 = VEC_UNPACK_LO_EXPR;
+  c2 = VEC_UNPACK_HI_EXPR;
+  op_type = unary_op;
+ }
       vect_create_vectorized_promotion_stmts (&vec_oprnds0,
       &vec_oprnds1,
       stmt, this_dest, gsi,
@@ -6510,7 +6514,7 @@ supportable_widening_operation (enum tree_code
code, gimple stmt,
   enum tree_code c1, c2;
   int i;
   tree prev_type, intermediate_type;
-  enum machine_mode intermediate_mode, prev_mode;
+  enum machine_mode intermediate_mode;
   optab optab3, optab4;

   *multi_step_cvt = 0;
@@ -6634,11 +6638,17 @@ supportable_widening_operation (enum tree_code
code, gimple stmt,
      types.  */

   prev_type = vectype;
-  prev_mode = vec_mode;

-  if (!CONVERT_EXPR_CODE_P (code))
+  /* For WIDEN_MULT_EXPR, it is possible that two steps are needed.  For
+     example, consider WIDEN_MULT_EXPR from char to int.  In the first step,
+     we can get the widen-mult result as short, and then widen it again to int.
+     VEC_UNPACK_LO_EXPR/VEC_UNPACK_HI_EXPR are used in the second step.  */
+  if (!CONVERT_EXPR_CODE_P (code) && code != WIDEN_MULT_EXPR)
     return false;

+  c1 = VEC_UNPACK_LO_EXPR;
+  c2 = VEC_UNPACK_HI_EXPR;
+
   /* We assume here that there will not be more than MAX_INTERM_CVT_STEPS
      intermediate steps in promotion sequence.  We try
      MAX_INTERM_CVT_STEPS to get to NARROW_VECTYPE, and fail if we do
@@ -6654,10 +6664,6 @@ supportable_widening_operation (enum tree_code
code, gimple stmt,
       optab4 = optab_for_tree_code (c2, intermediate_type, optab_default);

       if (!optab3 || !optab4
-          || (icode1 = optab_handler (optab1, prev_mode)) == CODE_FOR_nothing
-  || insn_data[icode1].operand[0].mode != intermediate_mode
-  || (icode2 = optab_handler (optab2, prev_mode)) == CODE_FOR_nothing
-  || insn_data[icode2].operand[0].mode != intermediate_mode
   || ((icode1 = optab_handler (optab3, intermediate_mode))
       == CODE_FOR_nothing)
   || ((icode2 = optab_handler (optab4, intermediate_mode))
@@ -6672,7 +6678,6 @@ supportable_widening_operation (enum tree_code
code, gimple stmt,
  return true;

       prev_type = intermediate_type;
-      prev_mode = intermediate_mode;
     }

   interm_types->release ();

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index f298c0b..44ed204 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,12 @@
+2013-12-02  Cong Hou  <congh@google.com>
+
+	* tree-vect-patterns.c (vect_recog_widen_mult_pattern): Enhance
+	the widen-mult pattern by handling two operands with different
+	sizes.
+	* tree-vect-stmts.c (vectorizable_conversion): Allow multi-steps
+	conversions after widening mult operation.
+	(supportable_widening_operation): Likewise.
+
 2013-11-22  Jakub Jelinek  <jakub@redhat.com>
 
 	PR sanitizer/59061
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 12d2c90..611ae1c 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,8 @@
+2013-12-02  Cong Hou  <congh@google.com>
+
+	* gcc.dg/vect/vect-widen-mult-u8-s16-s32.c: New test.
+	* gcc.dg/vect/vect-widen-mult-u8-u32.c: New test.
+
 2013-11-22  Jakub Jelinek  <jakub@redhat.com>
 
 	* c-c++-common/asan/no-redundant-instrumentation-7.c: Fix
diff --git a/gcc/testsuite/gcc.dg/vect/vect-widen-mult-u8-s16-s32.c b/gcc/testsuite/gcc.dg/vect/vect-widen-mult-u8-s16-s32.c
new file mode 100644
index 0000000..9f9081b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-widen-mult-u8-s16-s32.c
@@ -0,0 +1,48 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 64
+
+unsigned char X[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
+short Y[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
+int result[N];
+
+/* unsigned char * short -> int widening-mult.  */
+__attribute__ ((noinline)) int
+foo1(int len) {
+  int i;
+
+  for (i=0; i<len; i++) {
+    result[i] = X[i] * Y[i];
+  }
+}
+
+int main (void)
+{
+  int i;
+
+  check_vect ();
+
+  for (i=0; i<N; i++) {
+    X[i] = i;
+    Y[i] = 64-i;
+    __asm__ volatile ("");
+  }
+
+  foo1 (N);
+
+  for (i=0; i<N; i++) {
+    if (result[i] != X[i] * Y[i])
+      abort ();
+  }
+  
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_widen_mult_hi_to_si || vect_unpack } } } } */
+/* { dg-final { scan-tree-dump-times "vect_recog_widen_mult_pattern: detected" 1 "vect" { target vect_widen_mult_hi_to_si_pattern } } } */
+/* { dg-final { scan-tree-dump-times "pattern recognized" 1 "vect" { target vect_widen_mult_hi_to_si_pattern } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/vect-widen-mult-u8-u32.c b/gcc/testsuite/gcc.dg/vect/vect-widen-mult-u8-u32.c
new file mode 100644
index 0000000..12c4692
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-widen-mult-u8-u32.c
@@ -0,0 +1,48 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 64
+
+unsigned char X[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
+unsigned char Y[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
+unsigned int result[N];
+
+/* unsigned char-> unsigned int widening-mult.  */
+__attribute__ ((noinline)) int
+foo1(int len) {
+  int i;
+
+  for (i=0; i<len; i++) {
+    result[i] = X[i] * Y[i];
+  }
+}
+
+int main (void)
+{
+  int i;
+
+  check_vect ();
+
+  for (i=0; i<N; i++) {
+    X[i] = i;
+    Y[i] = 64-i;
+    __asm__ volatile ("");
+  }
+
+  foo1 (N);
+
+  for (i=0; i<N; i++) {
+    if (result[i] != X[i] * Y[i])
+      abort ();
+  }
+  
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_widen_mult_qi_to_hi || vect_unpack } } } } */
+/* { dg-final { scan-tree-dump-times "vect_recog_widen_mult_pattern: detected" 1 "vect" { target vect_widen_mult_qi_to_hi_pattern } } } */
+/* { dg-final { scan-tree-dump-times "pattern recognized" 1 "vect" { target vect_widen_mult_qi_to_hi_pattern } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/tree-vect-patterns.c b/gcc/tree-vect-patterns.c
index 7823cc3..06279bc 100644
--- a/gcc/tree-vect-patterns.c
+++ b/gcc/tree-vect-patterns.c
@@ -529,7 +529,8 @@ vect_handle_widen_op_by_const (gimple stmt, enum tree_code code,
 
    Try to find the following pattern:
 
-     type a_t, b_t;
+     type1 a_t;
+     type2 b_t;
      TYPE a_T, b_T, prod_T;
 
      S1  a_t = ;
@@ -538,11 +539,12 @@ vect_handle_widen_op_by_const (gimple stmt, enum tree_code code,
      S4  b_T = (TYPE) b_t;
      S5  prod_T = a_T * b_T;
 
-   where type 'TYPE' is at least double the size of type 'type'.
+   where type 'TYPE' is at least double the size of type 'type1' and 'type2'.
 
    Also detect unsigned cases:
 
-     unsigned type a_t, b_t;
+     unsigned type1 a_t;
+     unsigned type2 b_t;
      unsigned TYPE u_prod_T;
      TYPE a_T, b_T, prod_T;
 
@@ -661,6 +663,50 @@ vect_recog_widen_mult_pattern (vec<gimple> *stmts,
         return NULL;
     }
 
+  /* If the two arguments have different sizes, convert the one with
+     the smaller type into the larger type.  */
+  if (TYPE_PRECISION (half_type0) != TYPE_PRECISION (half_type1))
+    {
+      tree* oprnd = NULL;
+      gimple def_stmt = NULL;
+
+      if (TYPE_PRECISION (half_type0) < TYPE_PRECISION (half_type1))
+	{
+	  def_stmt = def_stmt0;
+	  half_type0 = half_type1;
+	  oprnd = &oprnd0;
+	}
+      else
+	{
+	  def_stmt = def_stmt1;
+	  half_type1 = half_type0;
+	  oprnd = &oprnd1;
+	}
+
+      if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt)))
+	{
+	  gimple new_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt));
+	  /* Check if the already created pattern stmt is what we need.  */
+	  if (!is_gimple_assign (new_stmt)
+	      || gimple_assign_rhs_code (new_stmt) != NOP_EXPR
+	      || TREE_TYPE (gimple_assign_lhs (new_stmt)) != half_type0)
+	    return NULL;
+
+	  stmts->safe_push (def_stmt);
+	  *oprnd = gimple_assign_lhs (new_stmt);
+	}
+      else
+	{
+	  tree old_oprnd = gimple_assign_rhs1 (def_stmt);
+	  tree new_oprnd = make_ssa_name (half_type0, NULL);
+	  gimple new_stmt = gimple_build_assign_with_ops (NOP_EXPR, new_oprnd,
+							  old_oprnd, NULL_TREE);
+	  STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt)) = new_stmt;
+	  stmts->safe_push (def_stmt);
+	  *oprnd = new_oprnd;
+	}
+    }
+
   /* Handle unsigned case.  Look for
      S6  u_prod_T = (unsigned TYPE) prod_T;
      Use unsigned TYPE as the type for WIDEN_MULT_EXPR.  */
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 72dfacd..e1ca2a2 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -2504,12 +2504,7 @@ vectorizable_conversion (gimple stmt, gimple_stmt_iterator *gsi,
       if (supportable_widening_operation (code, stmt, vectype_out, vectype_in,
 					  &code1, &code2, &multi_step_cvt,
 					  &interm_types))
-	{
-	  /* Binary widening operation can only be supported directly by the
-	     architecture.  */
-	  gcc_assert (!(multi_step_cvt && op_type == binary_op));
-	  break;
-	}
+	break;
 
       if (code != FLOAT_EXPR
 	  || (GET_MODE_SIZE (TYPE_MODE (lhs_type))
@@ -2787,6 +2782,15 @@ vectorizable_conversion (gimple stmt, gimple_stmt_iterator *gsi,
 		  c1 = codecvt1;
 		  c2 = codecvt2;
 		}
+	      else if (op_type == binary_op && i < multi_step_cvt)
+		{
+		  /* For binary widening operations, if multi-steps are needed,
+		     use VEC_UNPACK_LO_EXPR/VEC_UNPACK_HI_EXPR to convert
+		     intermediate result to final one.  */
+		  c1 = VEC_UNPACK_LO_EXPR;
+		  c2 = VEC_UNPACK_HI_EXPR;
+		  op_type = unary_op;
+		}
 	      vect_create_vectorized_promotion_stmts (&vec_oprnds0,
 						      &vec_oprnds1,
 						      stmt, this_dest, gsi,
@@ -6510,7 +6514,7 @@ supportable_widening_operation (enum tree_code code, gimple stmt,
   enum tree_code c1, c2;
   int i;
   tree prev_type, intermediate_type;
-  enum machine_mode intermediate_mode, prev_mode;
+  enum machine_mode intermediate_mode;
   optab optab3, optab4;
 
   *multi_step_cvt = 0;
@@ -6634,11 +6638,17 @@ supportable_widening_operation (enum tree_code code, gimple stmt,
      types.  */
 
   prev_type = vectype;
-  prev_mode = vec_mode;
 
-  if (!CONVERT_EXPR_CODE_P (code))
+  /* For WIDEN_MULT_EXPR, it is possible that two steps are needed.  For
+     example, consider WIDEN_MULT_EXPR from char to int.  In the first step,
+     we can get the widen-mult result as short, and then widen it again to int.
+     VEC_UNPACK_LO_EXPR/VEC_UNPACK_HI_EXPR are used in the second step.  */
+  if (!CONVERT_EXPR_CODE_P (code) && code != WIDEN_MULT_EXPR)
     return false;
 
+  c1 = VEC_UNPACK_LO_EXPR;
+  c2 = VEC_UNPACK_HI_EXPR;
+
   /* We assume here that there will not be more than MAX_INTERM_CVT_STEPS
      intermediate steps in promotion sequence.  We try
      MAX_INTERM_CVT_STEPS to get to NARROW_VECTYPE, and fail if we do
@@ -6654,10 +6664,6 @@ supportable_widening_operation (enum tree_code code, gimple stmt,
       optab4 = optab_for_tree_code (c2, intermediate_type, optab_default);
 
       if (!optab3 || !optab4
-          || (icode1 = optab_handler (optab1, prev_mode)) == CODE_FOR_nothing
-	  || insn_data[icode1].operand[0].mode != intermediate_mode
-	  || (icode2 = optab_handler (optab2, prev_mode)) == CODE_FOR_nothing
-	  || insn_data[icode2].operand[0].mode != intermediate_mode
 	  || ((icode1 = optab_handler (optab3, intermediate_mode))
 	      == CODE_FOR_nothing)
 	  || ((icode2 = optab_handler (optab4, intermediate_mode))
@@ -6672,7 +6678,6 @@ supportable_widening_operation (enum tree_code code, gimple stmt,
 	return true;
 
       prev_type = intermediate_type;
-      prev_mode = intermediate_mode;
     }
 
   interm_types->release ();


Enhancing the widen-mult pattern in vectorization.

Commit Message

Patch