Patchwork: Enhancing the widen-mult pattern in vectorization

Submitter Cong Hou
Date Dec. 6, 2013, 11:45 p.m.
Message ID <CAK=A3=0CRhLOXeJt9Q=99z6wvg=LCZTOBtWjc50dtPy_Bz85YQ@mail.gmail.com>
Permalink /patch/298588/
State New

Comments

Cong Hou - Dec. 6, 2013, 11:45 p.m.
After further reviewing this patch, I found I don't have to change the
code in tree-vect-stmts.c to allow further type conversion after the
widen-mult operation. Instead, I detect the following pattern in
vect_recog_widen_mult_pattern():

T1 a, b;
ai = (T2) a;
bi = (T2) b;
c = ai * bi;

where T2 is more than double the size of T1 (e.g., T1 is char and T2 is int).

In this case I just create a new type T3 whose size is double the
size of T1, then get an intermediate result of type T3 from the
widen-mult. Then I add a new statement to STMT_VINFO_PATTERN_DEF_SEQ
that converts this result into type T2.

This strategy makes the patch cleaner.

Bootstrapped and tested on an x86-64 machine.


thanks,
Cong


On Tue, Dec 3, 2013 at 2:25 PM, Cong Hou <congh@google.com> wrote:
> Hi
>
> The current widen-mult pattern only considers two operands with the
> same size. However, operands with different sizes can also benefit
> from this pattern. The following loop shows such an example:
>
>
> char a[N];
> short b[N];
> int c[N];
>
> for (int i = 0; i < N; ++i)
>   c[i] = a[i] * b[i];
>
>
> In this case, we can convert a[i] to short and then perform
> widen-mult on b[i] and the converted value:
>
>
> for (int i = 0; i < N; ++i) {
>   short t = a[i];
>   c[i] = t w* b[i];
> }
>
>
> This patch adds such support. In addition, the following loop fails to
> be recognized as a widen-mult pattern because the widening operation
> from char to int is not directly supported by the target:
>
>
> char a[N], b[N];
> int c[N];
>
> for (int i = 0; i < N; ++i)
>   c[i] = a[i] * b[i];
>
>
> In this case, we can still perform widen-mult on a[i] and b[i], and
> get a result of short type, then convert it to int:
>
>
> char a[N], b[N];
> int c[N];
>
> for (int i = 0; i < N; ++i) {
>   short t = a[i] w* b[i];
>   c[i] = (int) t;
> }
>
>
> Currently GCC does not allow multi-step conversions for binary
> widening operations. This patch removes that restriction and uses
> VEC_UNPACK_LO_EXPR/VEC_UNPACK_HI_EXPR to arrange data after the
> widen-mult is performed for the widen-mult pattern. This saves
> several packing/unpacking instructions (for this example, the number
> of packings/unpackings is reduced from 12 to 8; for SSE2, the
> inefficient multiplication between two V4SI vectors can also be avoided).
>
> Bootstrapped and tested on an x86_64 machine.
>
>
>
> thanks,
> Cong
>
>
>
> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
> index f298c0b..44ed204 100644
> --- a/gcc/ChangeLog
> +++ b/gcc/ChangeLog
> @@ -1,3 +1,12 @@
> +2013-12-02  Cong Hou  <congh@google.com>
> +
> + * tree-vect-patterns.c (vect_recog_widen_mult_pattern): Enhance
> + the widen-mult pattern by handling two operands with different
> + sizes.
> + * tree-vect-stmts.c (vectorizable_conversion): Allow multi-steps
> + conversions after widening mult operation.
> + (supportable_widening_operation): Likewise.
> +
>  2013-11-22  Jakub Jelinek  <jakub@redhat.com>
>
>   PR sanitizer/59061
> diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
> index 12d2c90..611ae1c 100644
> --- a/gcc/testsuite/ChangeLog
> +++ b/gcc/testsuite/ChangeLog
> @@ -1,3 +1,8 @@
> +2013-12-02  Cong Hou  <congh@google.com>
> +
> + * gcc.dg/vect/vect-widen-mult-u8-s16-s32.c: New test.
> + * gcc.dg/vect/vect-widen-mult-u8-u32.c: New test.
> +
>  2013-11-22  Jakub Jelinek  <jakub@redhat.com>
>
>   * c-c++-common/asan/no-redundant-instrumentation-7.c: Fix
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-widen-mult-u8-s16-s32.c b/gcc/testsuite/gcc.dg/vect/vect-widen-mult-u8-s16-s32.c
> new file mode 100644
> index 0000000..9f9081b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-widen-mult-u8-s16-s32.c
> @@ -0,0 +1,48 @@
> +/* { dg-require-effective-target vect_int } */
> +
> +#include <stdarg.h>
> +#include "tree-vect.h"
> +
> +#define N 64
> +
> +unsigned char X[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
> +short Y[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
> +int result[N];
> +
> +/* unsigned char * short -> int widening-mult.  */
> +__attribute__ ((noinline)) int
> +foo1(int len) {
> +  int i;
> +
> +  for (i=0; i<len; i++) {
> +    result[i] = X[i] * Y[i];
> +  }
> +}
> +
> +int main (void)
> +{
> +  int i;
> +
> +  check_vect ();
> +
> +  for (i=0; i<N; i++) {
> +    X[i] = i;
> +    Y[i] = 64-i;
> +    __asm__ volatile ("");
> +  }
> +
> +  foo1 (N);
> +
> +  for (i=0; i<N; i++) {
> +    if (result[i] != X[i] * Y[i])
> +      abort ();
> +  }
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_widen_mult_hi_to_si || vect_unpack } } } } */
> +/* { dg-final { scan-tree-dump-times "vect_recog_widen_mult_pattern: detected" 1 "vect" { target vect_widen_mult_hi_to_si_pattern } } } */
> +/* { dg-final { scan-tree-dump-times "pattern recognized" 1 "vect" { target vect_widen_mult_hi_to_si_pattern } } } */
> +/* { dg-final { cleanup-tree-dump "vect" } } */
> +
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-widen-mult-u8-u32.c b/gcc/testsuite/gcc.dg/vect/vect-widen-mult-u8-u32.c
> new file mode 100644
> index 0000000..51e9178
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-widen-mult-u8-u32.c
> @@ -0,0 +1,48 @@
> +/* { dg-require-effective-target vect_int } */
> +
> +#include <stdarg.h>
> +#include "tree-vect.h"
> +
> +#define N 64
> +
> +unsigned char X[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
> +unsigned char Y[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
> +unsigned int result[N];
> +
> +/* unsigned char-> unsigned int widening-mult.  */
> +__attribute__ ((noinline)) int
> +foo1(int len) {
> +  int i;
> +
> +  for (i=0; i<len; i++) {
> +    result[i] = X[i] * Y[i];
> +  }
> +}
> +
> +int main (void)
> +{
> +  int i;
> +
> +  check_vect ();
> +
> +  for (i=0; i<N; i++) {
> +    X[i] = i;
> +    Y[i] = 64-i;
> +    __asm__ volatile ("");
> +  }
> +
> +  foo1 (N);
> +
> +  for (i=0; i<N; i++) {
> +    if (result[i] != X[i] * Y[i])
> +      abort ();
> +  }
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_widen_mult_qi_to_hi || vect_unpack } } } } */
> +/* { dg-final { scan-tree-dump-times "vect_recog_widen_mult_pattern: detected" 1 "vect" { target vect_widen_mult_qi_to_hi_pattern } } } */
> +/* { dg-final { scan-tree-dump-times "pattern recognized" 1 "vect" { target vect_widen_mult_qi_to_hi_pattern } } } */
> +/* { dg-final { cleanup-tree-dump "vect" } } */
> +
> diff --git a/gcc/tree-vect-patterns.c b/gcc/tree-vect-patterns.c
> index 7823cc3..06279bc 100644
> --- a/gcc/tree-vect-patterns.c
> +++ b/gcc/tree-vect-patterns.c
> @@ -529,7 +529,8 @@ vect_handle_widen_op_by_const (gimple stmt, enum tree_code code,
>
>     Try to find the following pattern:
>
> -     type a_t, b_t;
> +     type1 a_t;
> +     type2 b_t;
>       TYPE a_T, b_T, prod_T;
>
>       S1  a_t = ;
> @@ -538,11 +539,12 @@ vect_handle_widen_op_by_const (gimple stmt, enum tree_code code,
>       S4  b_T = (TYPE) b_t;
>       S5  prod_T = a_T * b_T;
>
> -   where type 'TYPE' is at least double the size of type 'type'.
> +   where type 'TYPE' is at least double the size of types 'type1' and 'type2'.
>
>     Also detect unsigned cases:
>
> -     unsigned type a_t, b_t;
> +     unsigned type1 a_t;
> +     unsigned type2 b_t;
>       unsigned TYPE u_prod_T;
>       TYPE a_T, b_T, prod_T;
>
> @@ -661,6 +663,50 @@ vect_recog_widen_mult_pattern (vec<gimple> *stmts,
>          return NULL;
>      }
>
> +  /* If the two arguments have different sizes, convert the one with
> +     the smaller type into the larger type.  */
> +  if (TYPE_PRECISION (half_type0) != TYPE_PRECISION (half_type1))
> +    {
> +      tree* oprnd = NULL;
> +      gimple def_stmt = NULL;
> +
> +      if (TYPE_PRECISION (half_type0) < TYPE_PRECISION (half_type1))
> + {
> +  def_stmt = def_stmt0;
> +  half_type0 = half_type1;
> +  oprnd = &oprnd0;
> + }
> +      else
> + {
> +  def_stmt = def_stmt1;
> +  half_type1 = half_type0;
> +  oprnd = &oprnd1;
> + }
> +
> +      if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt)))
> + {
> +  gimple new_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt));
> +  /* Check if the already created pattern stmt is what we need.  */
> +  if (!is_gimple_assign (new_stmt)
> +      || gimple_assign_rhs_code (new_stmt) != NOP_EXPR
> +      || TREE_TYPE (gimple_assign_lhs (new_stmt)) != half_type0)
> +    return NULL;
> +
> +  stmts->safe_push (def_stmt);
> +  *oprnd = gimple_assign_lhs (new_stmt);
> + }
> +      else
> + {
> +  tree old_oprnd = gimple_assign_rhs1 (def_stmt);
> +  tree new_oprnd = make_ssa_name (half_type0, NULL);
> +  gimple new_stmt = gimple_build_assign_with_ops (NOP_EXPR, new_oprnd,
> +  old_oprnd, NULL_TREE);
> +  STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt)) = new_stmt;
> +  stmts->safe_push (def_stmt);
> +  *oprnd = new_oprnd;
> + }
> +    }
> +
>    /* Handle unsigned case.  Look for
>       S6  u_prod_T = (unsigned TYPE) prod_T;
>       Use unsigned TYPE as the type for WIDEN_MULT_EXPR.  */
> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
> index 72dfacd..e1ca2a2 100644
> --- a/gcc/tree-vect-stmts.c
> +++ b/gcc/tree-vect-stmts.c
> @@ -2504,12 +2504,7 @@ vectorizable_conversion (gimple stmt, gimple_stmt_iterator *gsi,
>        if (supportable_widening_operation (code, stmt, vectype_out, vectype_in,
>    &code1, &code2, &multi_step_cvt,
>    &interm_types))
> - {
> -  /* Binary widening operation can only be supported directly by the
> -     architecture.  */
> -  gcc_assert (!(multi_step_cvt && op_type == binary_op));
> -  break;
> - }
> + break;
>
>        if (code != FLOAT_EXPR
>    || (GET_MODE_SIZE (TYPE_MODE (lhs_type))
> @@ -2787,6 +2782,15 @@ vectorizable_conversion (gimple stmt, gimple_stmt_iterator *gsi,
>    c1 = codecvt1;
>    c2 = codecvt2;
>   }
> +      else if (op_type == binary_op && i < multi_step_cvt)
> + {
> +  /* For binary widening operations, if multiple steps are needed,
> +     use VEC_UNPACK_LO_EXPR/VEC_UNPACK_HI_EXPR to convert the
> +     intermediate result to the final one.  */
> +  c1 = VEC_UNPACK_LO_EXPR;
> +  c2 = VEC_UNPACK_HI_EXPR;
> +  op_type = unary_op;
> + }
>        vect_create_vectorized_promotion_stmts (&vec_oprnds0,
>        &vec_oprnds1,
>        stmt, this_dest, gsi,
> @@ -6510,7 +6514,7 @@ supportable_widening_operation (enum tree_code code, gimple stmt,
>    enum tree_code c1, c2;
>    int i;
>    tree prev_type, intermediate_type;
> -  enum machine_mode intermediate_mode, prev_mode;
> +  enum machine_mode intermediate_mode;
>    optab optab3, optab4;
>
>    *multi_step_cvt = 0;
> @@ -6634,11 +6638,17 @@ supportable_widening_operation (enum tree_code code, gimple stmt,
>       types.  */
>
>    prev_type = vectype;
> -  prev_mode = vec_mode;
>
> -  if (!CONVERT_EXPR_CODE_P (code))
> +  /* For WIDEN_MULT_EXPR, it is possible that two steps are needed.  For
> +     example, consider WIDEN_MULT_EXPR from char to int.  In the first step,
> +     we can get the widen-mult result as short, and then widen it again to int.
> +     VEC_UNPACK_LO_EXPR/VEC_UNPACK_HI_EXPR are used in the second step.  */
> +  if (!CONVERT_EXPR_CODE_P (code) && code != WIDEN_MULT_EXPR)
>      return false;
>
> +  c1 = VEC_UNPACK_LO_EXPR;
> +  c2 = VEC_UNPACK_HI_EXPR;
> +
>    /* We assume here that there will not be more than MAX_INTERM_CVT_STEPS
>       intermediate steps in promotion sequence.  We try
>       MAX_INTERM_CVT_STEPS to get to NARROW_VECTYPE, and fail if we do
> @@ -6654,10 +6664,6 @@ supportable_widening_operation (enum tree_code code, gimple stmt,
>        optab4 = optab_for_tree_code (c2, intermediate_type, optab_default);
>
>        if (!optab3 || !optab4
> -          || (icode1 = optab_handler (optab1, prev_mode)) == CODE_FOR_nothing
> -  || insn_data[icode1].operand[0].mode != intermediate_mode
> -  || (icode2 = optab_handler (optab2, prev_mode)) == CODE_FOR_nothing
> -  || insn_data[icode2].operand[0].mode != intermediate_mode
>    || ((icode1 = optab_handler (optab3, intermediate_mode))
>        == CODE_FOR_nothing)
>    || ((icode2 = optab_handler (optab4, intermediate_mode))
> @@ -6672,7 +6678,6 @@ supportable_widening_operation (enum tree_code code, gimple stmt,
>   return true;
>
>        prev_type = intermediate_type;
> -      prev_mode = intermediate_mode;
>      }
>
>    interm_types->release ();
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index f298c0b..12990b2 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,10 @@
+2013-12-02  Cong Hou  <congh@google.com>
+
+	* tree-vect-patterns.c (vect_recog_widen_mult_pattern): Enhance
+	the widen-mult pattern by handling two operands with different
+	sizes, and operands whose size is smaller than half of the result
+	type.
+
 2013-11-22  Jakub Jelinek  <jakub@redhat.com>
 
 	PR sanitizer/59061
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 12d2c90..611ae1c 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,8 @@
+2013-12-02  Cong Hou  <congh@google.com>
+
+	* gcc.dg/vect/vect-widen-mult-u8-s16-s32.c: New test.
+	* gcc.dg/vect/vect-widen-mult-u8-u32.c: New test.
+
 2013-11-22  Jakub Jelinek  <jakub@redhat.com>
 
 	* c-c++-common/asan/no-redundant-instrumentation-7.c: Fix
diff --git a/gcc/testsuite/gcc.dg/vect/vect-widen-mult-u8-s16-s32.c b/gcc/testsuite/gcc.dg/vect/vect-widen-mult-u8-s16-s32.c
new file mode 100644
index 0000000..9f9081b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-widen-mult-u8-s16-s32.c
@@ -0,0 +1,48 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 64
+
+unsigned char X[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
+short Y[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
+int result[N];
+
+/* unsigned char * short -> int widening-mult.  */
+__attribute__ ((noinline)) int
+foo1(int len) {
+  int i;
+
+  for (i=0; i<len; i++) {
+    result[i] = X[i] * Y[i];
+  }
+}
+
+int main (void)
+{
+  int i;
+
+  check_vect ();
+
+  for (i=0; i<N; i++) {
+    X[i] = i;
+    Y[i] = 64-i;
+    __asm__ volatile ("");
+  }
+
+  foo1 (N);
+
+  for (i=0; i<N; i++) {
+    if (result[i] != X[i] * Y[i])
+      abort ();
+  }
+  
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_widen_mult_hi_to_si || vect_unpack } } } } */
+/* { dg-final { scan-tree-dump-times "vect_recog_widen_mult_pattern: detected" 1 "vect" { target vect_widen_mult_hi_to_si_pattern } } } */
+/* { dg-final { scan-tree-dump-times "pattern recognized" 1 "vect" { target vect_widen_mult_hi_to_si_pattern } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/vect-widen-mult-u8-u32.c b/gcc/testsuite/gcc.dg/vect/vect-widen-mult-u8-u32.c
new file mode 100644
index 0000000..12c4692
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-widen-mult-u8-u32.c
@@ -0,0 +1,48 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 64
+
+unsigned char X[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
+unsigned char Y[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
+unsigned int result[N];
+
+/* unsigned char-> unsigned int widening-mult.  */
+__attribute__ ((noinline)) int
+foo1(int len) {
+  int i;
+
+  for (i=0; i<len; i++) {
+    result[i] = X[i] * Y[i];
+  }
+}
+
+int main (void)
+{
+  int i;
+
+  check_vect ();
+
+  for (i=0; i<N; i++) {
+    X[i] = i;
+    Y[i] = 64-i;
+    __asm__ volatile ("");
+  }
+
+  foo1 (N);
+
+  for (i=0; i<N; i++) {
+    if (result[i] != X[i] * Y[i])
+      abort ();
+  }
+  
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_widen_mult_qi_to_hi || vect_unpack } } } } */
+/* { dg-final { scan-tree-dump-times "vect_recog_widen_mult_pattern: detected" 1 "vect" { target vect_widen_mult_qi_to_hi_pattern } } } */
+/* { dg-final { scan-tree-dump-times "pattern recognized" 1 "vect" { target vect_widen_mult_qi_to_hi_pattern } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/tree-vect-patterns.c b/gcc/tree-vect-patterns.c
index 7823cc3..f412e2d 100644
--- a/gcc/tree-vect-patterns.c
+++ b/gcc/tree-vect-patterns.c
@@ -529,7 +529,8 @@ vect_handle_widen_op_by_const (gimple stmt, enum tree_code code,
 
    Try to find the following pattern:
 
-     type a_t, b_t;
+     type1 a_t;
+     type2 b_t;
      TYPE a_T, b_T, prod_T;
 
      S1  a_t = ;
@@ -538,11 +539,12 @@ vect_handle_widen_op_by_const (gimple stmt, enum tree_code code,
      S4  b_T = (TYPE) b_t;
      S5  prod_T = a_T * b_T;
 
-   where type 'TYPE' is at least double the size of type 'type'.
+   where type 'TYPE' is at least double the size of types 'type1' and 'type2'.
 
    Also detect unsigned cases:
 
-     unsigned type a_t, b_t;
+     unsigned type1 a_t;
+     unsigned type2 b_t;
      unsigned TYPE u_prod_T;
      TYPE a_T, b_T, prod_T;
 
@@ -596,6 +598,8 @@ vect_handle_widen_op_by_const (gimple stmt, enum tree_code code,
    * Return value: A new stmt that will be used to replace the sequence of
    stmts that constitute the pattern.  In this case it will be:
         WIDEN_MULT <a_t, b_t>
+   If the result of WIDEN_MULT needs to be converted to a larger type, the
+   returned stmt will be this type conversion stmt.
 */
 
 static gimple
@@ -606,8 +610,8 @@ vect_recog_widen_mult_pattern (vec<gimple> *stmts,
   gimple def_stmt0, def_stmt1;
   tree oprnd0, oprnd1;
   tree type, half_type0, half_type1;
-  gimple pattern_stmt;
-  tree vectype, vectype_out = NULL_TREE;
+  gimple new_stmt = NULL, pattern_stmt = NULL;
+  tree vectype, vecitype;
   tree var;
   enum tree_code dummy_code;
   int dummy_int;
@@ -661,6 +665,33 @@ vect_recog_widen_mult_pattern (vec<gimple> *stmts,
         return NULL;
     }
 
+  /* If the two arguments have different sizes, convert the one with
+     the smaller type into the larger type.  */
+  if (TYPE_PRECISION (half_type0) != TYPE_PRECISION (half_type1))
+    {
+      tree* oprnd = NULL;
+      gimple def_stmt = NULL;
+
+      if (TYPE_PRECISION (half_type0) < TYPE_PRECISION (half_type1))
+	{
+	  def_stmt = def_stmt0;
+	  half_type0 = half_type1;
+	  oprnd = &oprnd0;
+	}
+      else
+	{
+	  def_stmt = def_stmt1;
+	  half_type1 = half_type0;
+	  oprnd = &oprnd1;
+	}
+
+        tree old_oprnd = gimple_assign_rhs1 (def_stmt);
+        tree new_oprnd = make_ssa_name (half_type0, NULL);
+        new_stmt = gimple_build_assign_with_ops (NOP_EXPR, new_oprnd,
+                                                 old_oprnd, NULL_TREE);
+        *oprnd = new_oprnd;
+    }
+
   /* Handle unsigned case.  Look for
      S6  u_prod_T = (unsigned TYPE) prod_T;
      Use unsigned TYPE as the type for WIDEN_MULT_EXPR.  */
@@ -692,6 +723,15 @@ vect_recog_widen_mult_pattern (vec<gimple> *stmts,
   if (!types_compatible_p (half_type0, half_type1))
     return NULL;
 
+  /* If TYPE is more than twice as large as HALF_TYPE, we use WIDEN_MULT
+     to get an intermediate result of type ITYPE.  In this case we need
+     to build a statement to convert this intermediate result to type TYPE.  */
+  tree itype = type;
+  if (TYPE_PRECISION (type) > TYPE_PRECISION (half_type0) * 2)
+    itype = build_nonstandard_integer_type
+              (GET_MODE_BITSIZE (TYPE_MODE (half_type0)) * 2,
+               TYPE_UNSIGNED (type));
+
   /* Pattern detected.  */
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
@@ -699,23 +739,56 @@ vect_recog_widen_mult_pattern (vec<gimple> *stmts,
 
   /* Check target support  */
   vectype = get_vectype_for_scalar_type (half_type0);
-  vectype_out = get_vectype_for_scalar_type (type);
+  vecitype = get_vectype_for_scalar_type (itype);
   if (!vectype
-      || !vectype_out
+      || !vecitype
       || !supportable_widening_operation (WIDEN_MULT_EXPR, last_stmt,
-					  vectype_out, vectype,
+					  vecitype, vectype,
 					  &dummy_code, &dummy_code,
 					  &dummy_int, &dummy_vec))
     return NULL;
 
   *type_in = vectype;
-  *type_out = vectype_out;
+  *type_out = get_vectype_for_scalar_type (type);
 
   /* Pattern supported. Create a stmt to be used to replace the pattern: */
-  var = vect_recog_temp_ssa_var (type, NULL);
+  var = vect_recog_temp_ssa_var (itype, NULL);
   pattern_stmt = gimple_build_assign_with_ops (WIDEN_MULT_EXPR, var, oprnd0,
 					       oprnd1);
 
+  stmt_vec_info stmt_vinfo = vinfo_for_stmt (last_stmt);
+  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
+  bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_vinfo);
+  STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) = NULL;
+
+  /* If the original two operands have different sizes, we may need to convert
+     the smaller one into the larger type.  If this is the case, at this point
+     the new stmt is already built.  */
+  if (new_stmt)
+    {
+      append_pattern_def_seq (stmt_vinfo, new_stmt);
+      stmt_vec_info new_stmt_info
+        = new_stmt_vec_info (new_stmt, loop_vinfo, bb_vinfo);
+      set_vinfo_for_stmt (new_stmt, new_stmt_info);
+      STMT_VINFO_VECTYPE (new_stmt_info) = vectype;
+    }
+
+  /* If ITYPE is not TYPE, we need to build a type conversion stmt to convert
+     the result of the widen-mult operation into type TYPE.  */
+  if (itype != type)
+    {
+      append_pattern_def_seq (stmt_vinfo, pattern_stmt);
+      stmt_vec_info pattern_stmt_info
+        = new_stmt_vec_info (pattern_stmt, loop_vinfo, bb_vinfo);
+      set_vinfo_for_stmt (pattern_stmt, pattern_stmt_info);
+      STMT_VINFO_VECTYPE (pattern_stmt_info) = vecitype;
+      pattern_stmt
+        = gimple_build_assign_with_ops (NOP_EXPR,
+                                        vect_recog_temp_ssa_var (type, NULL),
+                                        gimple_assign_lhs (pattern_stmt),
+                                        NULL_TREE);
+    }
+
   if (dump_enabled_p ())
     dump_gimple_stmt_loc (MSG_NOTE, vect_location, TDF_SLIM, pattern_stmt, 0);
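
As described in the quoted message above, the conversion of the
intermediate widen-mult result to the final type is handled with
VEC_UNPACK_LO_EXPR/VEC_UNPACK_HI_EXPR. A minimal SSE2 sketch of that
step for unsigned values, assuming eight 16-bit products in one vector
(the helper name is illustrative only):

#include <emmintrin.h>

/* Widen eight u16 values to two vectors of four u32 each.  For
   unsigned inputs, the lo/hi unpack steps amount to interleaving
   with zero.  */
static inline void
unpack_u16_to_u32 (__m128i prod, __m128i *lo, __m128i *hi)
{
  const __m128i zero = _mm_setzero_si128 ();
  *lo = _mm_unpacklo_epi16 (prod, zero);  /* low four elements  */
  *hi = _mm_unpackhi_epi16 (prod, zero);  /* high four elements */
}
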
Richard Guenther - April 17, 2014, 9:54 a.m.
On Sat, Dec 7, 2013 at 12:45 AM, Cong Hou <congh@google.com> wrote:
> After further reviewing this patch, I found I don't have to change the
> code in tree-vect-stmts.c to allow further type conversion after the
> widen-mult operation. Instead, I detect the following pattern in
> vect_recog_widen_mult_pattern():
>
> T1 a, b;
> ai = (T2) a;
> bi = (T2) b;
> c = ai * bi;
>
> where T2 is more than double the size of T1 (e.g., T1 is char and T2 is int).
>
> In this case I just create a new type T3 whose size is double the
> size of T1, then get an intermediate result of type T3 from the
> widen-mult. Then I add a new statement to STMT_VINFO_PATTERN_DEF_SEQ
> that converts this result into type T2.
>
> This strategy makes the patch cleaner.
>
> Bootstrapped and tested on an x86-64 machine.

Ok for trunk (please re-bootstrap/test of course).

Thanks,
Richard.
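
For reference, a typical way to re-run just the new tests from a
configured GCC build tree (the test-name glob is illustrative):

make check-gcc RUNTESTFLAGS="vect.exp=vect-widen-mult-u8-*"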
