Patchwork [WIP] Vectorization using elemental functions

login
register
mail settings
Submitter Jakub Jelinek
Date Nov. 7, 2013, 8:48 p.m.
Message ID <20131107204808.GK27813@tucnak.zalov.cz>
Download mbox | patch
Permalink /patch/289480/
State New
Headers show

Comments

Jakub Jelinek - Nov. 7, 2013, 8:48 p.m.
Hi!

Here is a WIP.  As the cloning doesn't yet produce multiple arguments
for one arg (say for SSE2 variant and simdlen(8) int should be passed
in two V4SImode arguments, while currently it is passed in one vector(8) int
BLKmode one) nor returned in multiple vectors (any thoughts how to represent
that in the IL?  Returning an ARRAY_TYPE of VECTOR_TYPEs and handle that
specially in the backends?), this handles only some cases, like attached.
s1.c and s5.c seem to be vectorized nicely all the way through, s1.c
doesn't contain multiple type sizes nor anything complicated, s5.c
requires computation of V8SImode vector and then calling the elemental
clone twice, each time with V4SImode half of that vector as argument.
I'm using BIT_FIELD_REFs for that.  s3.c and s4.c require both
splitting an 8 unit vector into halves and also concatenating two 4 unit
vectors into one 8 unit one, for that I've tried to use a V8SImode
CONSTRUCTOR with two V4SImode SSA_NAMEs in it, but it seems expansion
ICEs on it:
                /* Vector CONSTRUCTORs should only be built from smaller
                   vectors in the case of BLKmode vectors.  */
                gcc_assert (TREE_CODE (TREE_TYPE (value)) != VECTOR_TYPE);
                RTVEC_ELT (vector, eltpos)
                  = expand_normal (value);
So, should I change expansion to handle this, or is there some other
preferred way to represent concatenation of 2 (or 4, 8, ...) smaller
vectors into one larger one?  Using BIT_FIELD_REFs on lhs on something
previously cleared would likely not result in best code.

Also, the s2.c testcase shows that elemental functions without return value
still need more work.  And the testcase also doesn't generate (for now all
ones) mask argument if the elemental is only in the inbranch version.

And, of course, besides adding some faster way to enumerate simd clones,
we want to add implicit target attributes to the simd clones and verify
whether we depending on the current compiler options can actually call such
a clone (is that check whether we could inline the elemental function into
the caller from target attributes POV, right?).


	Jakub
/* { dg-do options "-O3 -fopenmp -mavx2" } */

int array[1000];

#pragma omp declare simd simdlen(4) notinbranch
#pragma omp declare simd simdlen(4) notinbranch uniform(b) linear(c:3)
#pragma omp declare simd simdlen(8) notinbranch
#pragma omp declare simd simdlen(8) notinbranch uniform(b) linear(c:3)
__attribute__((noinline)) int
foo (int a, int b, int c)
{
  if (a == b + c)
    return 5;
  else
    return 6;
}

void
bar ()
{
  int i;
  for (i = 0; i < 1000; ++i)
    array[i] = foo (i, 123, i * 3);
  for (i = 0; i < 1000; ++i)
    array[i] = foo (i, array[i], i * 3);
}
/* { dg-options "-O3 -fopenmp -mavx2" } */

int d[1024], e[1024];

#pragma omp declare simd simdlen(4) notinbranch uniform(b) linear(c:3)
__attribute__((noinline)) long int
foo (int a, int b, int c)
{
  if (a == b + c)
    return 5L;
  else
    return 6L;
}

void
bar ()
{
  int i;
  for (i = 0; i < 1024; ++i)
    {
      d[i] = foo (i, 123, i * 3);
      e[i] = e[i] * 3;
    }
}
/* { dg-options "-O3 -fopenmp -mavx2" } */

int d[1024], e[1024];

#pragma omp declare simd simdlen(4) notinbranch uniform(b) linear(c:3)
__attribute__((noinline)) int
foo (int a, int b, int c)
{
  if (a == b + c)
    return 5;
  else
    return 6;
}

void
bar ()
{
  int i;
  for (i = 0; i < 1024; ++i)
    {
      d[i] = foo (i, 123, i * 3);
      e[i] = e[i] * 3;
    }
}
/* { dg-options "-O3 -fopenmp -mavx -mno-avx2" } */

float d[1024];
int e[1024];
unsigned short f[1024];

#pragma omp declare simd simdlen(8) notinbranch uniform(b)
__attribute__((noinline)) float
foo (float a, float b, float c)
{
  if (a == b + c)
    return 5.0f;
  else
    return 6.0f;
}

void
bar ()
{
  int i;
  for (i = 0; i < 1024; ++i)
    {
      d[i] = foo (i, 123, i * 3);
      e[i] = e[i] * 3;
      f[i] = f[i] + 1;
    }
}
int array[1000];

#pragma omp declare simd simdlen(4) notinbranch aligned(a:16) uniform(a) linear(b)
#pragma omp declare simd simdlen(4) notinbranch aligned(a:32) uniform(a) linear(b)
#pragma omp declare simd simdlen(8) notinbranch aligned(a:16) uniform(a) linear(b)
#pragma omp declare simd simdlen(8) notinbranch aligned(a:32) uniform(a) linear(b)
__attribute__((noinline)) void
foo (int *a, int b, int c)
{
  a[b] = c;
}

void
bar ()
{
  int i;
  for (i = 0; i < 1000; ++i)
    foo (array, i, i * array[i]);
}

Patch

--- gcc/tree-vectorizer.h.jj	2013-11-07 12:34:50.047501234 +0100
+++ gcc/tree-vectorizer.h	2013-11-07 12:37:17.742708618 +0100
@@ -416,6 +416,7 @@  enum stmt_vec_info_type {
   shift_vec_info_type,
   op_vec_info_type,
   call_vec_info_type,
+  call_simd_clone_vec_info_type,
   assignment_vec_info_type,
   condition_vec_info_type,
   reduc_vec_info_type,
--- gcc/tree-vect-stmts.c.jj	2013-11-07 12:34:50.095500978 +0100
+++ gcc/tree-vect-stmts.c	2013-11-07 21:01:23.364759827 +0100
@@ -37,6 +37,8 @@  along with GCC; see the file COPYING3.
 #include "tree-ssanames.h"
 #include "tree-ssa-loop-manip.h"
 #include "cfgloop.h"
+#include "tree-ssa-loop.h"
+#include "tree-scalar-evolution.h"
 #include "expr.h"
 #include "recog.h"		/* FIXME: for insn_data */
 #include "optabs.h"
@@ -1695,16 +1697,6 @@  tree
 vectorizable_function (gimple call, tree vectype_out, tree vectype_in)
 {
   tree fndecl = gimple_call_fndecl (call);
-  struct cgraph_node *node = cgraph_get_node (fndecl);
-
-  if (node->has_simd_clones)
-    {
-      struct cgraph_node *clone = get_simd_clone (node, vectype_out);
-      if (clone)
-	return clone->decl;
-      /* Fall through in case we ever add support for
-	 non-built-ins.  */
-    }
 
   /* We only handle functions that do not read or clobber memory -- i.e.
      const or novops ones.  */
@@ -1775,12 +1767,10 @@  vectorizable_call (gimple stmt, gimple_s
   vectype_in = NULL_TREE;
   nargs = gimple_call_num_args (stmt);
 
-  /* Bail out if the function has more than three arguments.  We do
-     not have interesting builtin functions to vectorize with more
-     than two arguments except for fma (unless we have SIMD clones).
-     No arguments is also not good.  */
-  struct cgraph_node *node = cgraph_get_node (gimple_call_fndecl (stmt));
-  if (nargs == 0 || (!node->has_simd_clones && nargs > 3))
+  /* Bail out if the function has more than three arguments, we do not have
+     interesting builtin functions to vectorize with more than two arguments
+     except for fma.  No arguments is also not good.  */
+  if (nargs == 0 || nargs > 3)
     return false;
 
   /* Ignore the argument of IFN_GOMP_SIMD_LANE, it is magic.  */
@@ -2143,6 +2133,496 @@  vectorizable_call (gimple stmt, gimple_s
 }
 
 
+struct simd_call_arg_info
+{
+  tree vectype;
+  tree op;
+  enum vect_def_type dt;
+  HOST_WIDE_INT linear_step;
+  unsigned int align;
+};
+
+/* Function vectorizable_simd_clone_call.
+
+   Check if STMT performs a function call that can be vectorized
+   by calling a simd clone of the function.
+   If VEC_STMT is also passed, vectorize the STMT: create a vectorized
+   stmt to replace it, put it in VEC_STMT, and insert it at BSI.
+   Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
+
+static bool
+vectorizable_simd_clone_call (gimple stmt, gimple_stmt_iterator *gsi,
+			      gimple *vec_stmt, slp_tree slp_node)
+{
+  tree vec_dest;
+  tree scalar_dest;
+  tree op, type;
+  tree vec_oprnd0 = NULL_TREE;
+  stmt_vec_info stmt_info = vinfo_for_stmt (stmt), prev_stmt_info;
+  tree vectype;
+  unsigned int nunits;
+  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
+  bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
+  struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
+  tree fndecl, new_temp, def;
+  gimple def_stmt;
+  gimple new_stmt = NULL;
+  int ncopies, j;
+  vec<simd_call_arg_info> arginfo = vNULL;
+  vec<tree> vargs = vNULL;
+  size_t i, nargs;
+  tree lhs, rtype;
+  vec<constructor_elt, va_gc> *ret_ctor_elts;
+
+  /* Is STMT a vectorizable call?   */
+  if (!is_gimple_call (stmt))
+    return false;
+
+  fndecl = gimple_call_fndecl (stmt);
+  if (fndecl == NULL_TREE)
+    return false;
+
+  struct cgraph_node *node = cgraph_get_node (fndecl);
+  if (node == NULL || !node->has_simd_clones)
+    return false;
+
+  if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
+    return false;
+
+  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
+    return false;
+
+  if (TREE_CODE (gimple_call_lhs (stmt)) != SSA_NAME)
+    return false;
+
+  if (stmt_can_throw_internal (stmt))
+    return false;
+
+  vectype = STMT_VINFO_VECTYPE (stmt_info);
+
+  if (loop_vinfo && nested_in_vect_loop_p (loop, stmt))
+    return false;
+
+  /* FORNOW */
+  if (slp_node || PURE_SLP_STMT (stmt_info))
+    return false;
+
+  /* Process function arguments.  */
+  nargs = gimple_call_num_args (stmt);
+
+  /* Bail out if the function has zero arguments.  */
+  if (nargs == 0)
+    return false;
+
+  arginfo.create (nargs);
+
+  for (i = 0; i < nargs; i++)
+    {
+      simd_call_arg_info thisarginfo;
+      affine_iv iv;
+
+      thisarginfo.linear_step = 0;
+      thisarginfo.align = 0;
+      thisarginfo.op = NULL_TREE;
+
+      op = gimple_call_arg (stmt, i);
+      if (!vect_is_simple_use_1 (op, stmt, loop_vinfo, bb_vinfo,
+				 &def_stmt, &def, &thisarginfo.dt,
+				 &thisarginfo.vectype))
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                             "use not simple.\n");
+	  arginfo.release ();
+	  return false;
+	}
+
+      if (thisarginfo.vectype != NULL_TREE
+	  && loop_vinfo
+	  && TREE_CODE (op) == SSA_NAME
+	  && simple_iv (loop, loop_containing_stmt (stmt), op, &iv, false)
+          && host_integerp (iv.step, 0))
+	{
+	  thisarginfo.linear_step = tree_low_cst (iv.step, 0);
+	  thisarginfo.op = iv.base;
+	}
+      else if (thisarginfo.vectype == NULL_TREE
+	       && POINTER_TYPE_P (TREE_TYPE (op)))
+	thisarginfo.align = get_pointer_alignment (op) / BITS_PER_UNIT;
+
+      arginfo.quick_push (thisarginfo);
+    }
+
+  unsigned int badness = 0;
+  /* FIXME: Nasty kludge until we figure out where to put the clone
+     list-- perhaps, next_sibling_clone/prev_sibling_clone in
+     cgraph_node ??.  */
+  struct cgraph_node *bestn = NULL, *n;
+  FOR_EACH_FUNCTION (n)
+    if (n->simdclone_of == node)
+      {
+	unsigned int this_badness = 0;
+	if (n->simdclone->simdlen
+	    > (unsigned) LOOP_VINFO_VECT_FACTOR (loop_vinfo)
+	    || n->simdclone->nargs != nargs)
+	  continue;
+	if (n->simdclone->simdlen
+	    < (unsigned) LOOP_VINFO_VECT_FACTOR (loop_vinfo))
+	  this_badness += (exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
+			   - exact_log2 (n->simdclone->simdlen)) * 1024;
+	if (n->simdclone->inbranch)
+	  this_badness += 2048;
+	for (i = 0; i < nargs; i++)
+	  {
+	    switch (n->simdclone->args[i].arg_type)
+	      {
+	      case SIMD_CLONE_ARG_TYPE_VECTOR:
+		if (arginfo[i].vectype == NULL_TREE
+		    || arginfo[i].linear_step)
+		  this_badness += 64;
+		break;
+	      case SIMD_CLONE_ARG_TYPE_UNIFORM:
+		if (arginfo[i].vectype != NULL_TREE)
+		  i = -1;
+		break;
+	      case SIMD_CLONE_ARG_TYPE_LINEAR_CONSTANT_STEP:
+		if (arginfo[i].vectype == NULL_TREE
+		    || (arginfo[i].linear_step
+			!= n->simdclone->args[i].linear_step))
+		  i = -1;
+		break;
+	      case SIMD_CLONE_ARG_TYPE_LINEAR_VARIABLE_STEP:
+		/* FORNOW */
+		i = -1;
+		break;
+	      }
+	    if (i == (size_t) -1)
+	      break;
+	    if (n->simdclone->args[i].alignment > arginfo[i].align)
+	      {
+		i = -1;
+		break;
+	      }
+	    if (arginfo[i].align)
+	      this_badness += (exact_log2 (arginfo[i].align)
+			       - exact_log2 (n->simdclone->args[i].alignment));
+	  }
+	if (i == (size_t) -1)
+	  continue;
+	if (bestn == NULL || this_badness < badness)
+	  {
+	    bestn = n;
+	    badness = this_badness;
+	  }
+      }
+
+  if (bestn == NULL)
+    {
+      arginfo.release ();
+      return false;
+    }
+
+  fndecl = bestn->decl;
+  nunits = bestn->simdclone->simdlen;
+  ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
+
+  /* Sanity check: make sure that at least one copy of the vectorized stmt
+     needs to be generated.  */
+  gcc_assert (ncopies >= 1);
+
+  if (!vec_stmt) /* transformation not required.  */
+    {
+      STMT_VINFO_TYPE (stmt_info) = call_simd_clone_vec_info_type;
+      if (dump_enabled_p ())
+        dump_printf_loc (MSG_NOTE, vect_location,
+			 "=== vectorizable_simd_clone_call ===\n");
+/*      vect_model_simple_cost (stmt_info, ncopies, dt, NULL, NULL); */
+      arginfo.release ();
+      return true;
+    }
+
+  /** Transform.  **/
+
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location, "transform call.\n");
+
+  /* Handle def.  */
+  scalar_dest = gimple_call_lhs (stmt);
+  vec_dest = NULL_TREE;
+  rtype = NULL_TREE;
+  if (scalar_dest)
+    {
+      vec_dest = vect_create_destination_var (scalar_dest, vectype);
+      rtype = TREE_TYPE (TREE_TYPE (fndecl));
+    }
+
+  prev_stmt_info = NULL;
+  for (j = 0; j < ncopies; ++j)
+    {
+      /* Build argument list for the vectorized call.  */
+      if (j == 0)
+	vargs.create (nargs);
+      else
+	vargs.truncate (0);
+
+      for (i = 0; i < nargs; i++)
+	{
+	  unsigned int k, l;
+	  tree atype;
+	  op = gimple_call_arg (stmt, i);
+	  switch (bestn->simdclone->args[i].arg_type)
+	    {
+	    case SIMD_CLONE_ARG_TYPE_VECTOR:
+	      /* FIXME */
+	      atype = TREE_TYPE (bestn->simdclone->args[i].vector_arg);
+	      gcc_assert (TYPE_VECTOR_SUBPARTS (atype) == nunits);
+	      if (nunits < TYPE_VECTOR_SUBPARTS (arginfo[i].vectype))
+		{
+		  unsigned int prec = GET_MODE_BITSIZE (TYPE_MODE (atype));
+		  k = TYPE_VECTOR_SUBPARTS (arginfo[i].vectype) / nunits;
+		  gcc_assert ((k & (k - 1)) == 0);
+		  if (j == 0)
+		    vec_oprnd0
+		      = vect_get_vec_def_for_operand (op, stmt, NULL);
+		  else
+		    {
+		      vec_oprnd0 = arginfo[i].op;
+		      if ((j & (k - 1)) == 0)
+			vec_oprnd0
+			  = vect_get_vec_def_for_stmt_copy (arginfo[i].dt,
+							    vec_oprnd0);
+		    }
+		  arginfo[i].op = vec_oprnd0;
+		  vec_oprnd0 = build3 (BIT_FIELD_REF, atype, vec_oprnd0,
+				       build_int_cst (integer_type_node, prec),
+				       build_int_cst (integer_type_node,
+						      (j & (k - 1)) * prec));
+		  new_stmt
+		    = gimple_build_assign_with_ops (BIT_FIELD_REF,
+						    make_ssa_name (atype,
+								   NULL),
+						    vec_oprnd0, NULL_TREE);
+		  vect_finish_stmt_generation (stmt, new_stmt, gsi);
+		  vargs.quick_push (gimple_assign_lhs (new_stmt));
+		  break;
+		}
+	      k = nunits / TYPE_VECTOR_SUBPARTS (arginfo[i].vectype);
+	      gcc_assert ((k & (k - 1)) == 0);
+	      vec<constructor_elt, va_gc> *ctor_elts;
+	      if (k != 1)
+		vec_alloc (ctor_elts, k);
+	      else
+		ctor_elts = NULL;
+	      for (l = 0; l < k; l++)
+		{
+		  if (j == 0 && l == 0)
+		    vec_oprnd0
+		      = vect_get_vec_def_for_operand (op, stmt, NULL);
+		  else
+		    vec_oprnd0
+		      = vect_get_vec_def_for_stmt_copy (arginfo[i].dt,
+							arginfo[i].op);
+		  arginfo[i].op = vec_oprnd0;
+		  if (k == 1)
+		    break;
+		  CONSTRUCTOR_APPEND_ELT (ctor_elts, NULL_TREE, vec_oprnd0);
+		}
+	      if (k == 1)
+		{
+		  vargs.quick_push (vec_oprnd0);
+		  break;
+		}
+	      vec_oprnd0 = build_constructor (atype, ctor_elts);
+	      new_stmt
+		= gimple_build_assign_with_ops (CONSTRUCTOR,
+						make_ssa_name (atype, NULL),
+						vec_oprnd0, NULL_TREE);
+	      vect_finish_stmt_generation (stmt, new_stmt, gsi);
+	      vargs.quick_push (gimple_assign_lhs (new_stmt));
+	      break;
+	    case SIMD_CLONE_ARG_TYPE_UNIFORM:
+	      vargs.quick_push (op);
+	      break;
+	    case SIMD_CLONE_ARG_TYPE_LINEAR_CONSTANT_STEP:
+	      if (j == 0)
+		{
+		  gimple_seq stmts;
+		  arginfo[i].op
+		    = force_gimple_operand (arginfo[i].op, &stmts, true,
+					    NULL_TREE);
+		  if (stmts != NULL)
+		    {
+		      basic_block new_bb;
+		      edge pe = loop_preheader_edge (loop);
+		      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
+		      gcc_assert (!new_bb);
+		    }
+		  tree phi_res = copy_ssa_name (op, NULL);
+		  gimple new_phi = create_phi_node (phi_res, loop->header);
+		  set_vinfo_for_stmt (new_phi,
+				      new_stmt_vec_info (new_phi, loop_vinfo,
+							 NULL));
+		  add_phi_arg (new_phi, arginfo[i].op,
+			       loop_preheader_edge (loop), UNKNOWN_LOCATION);
+		  enum tree_code code
+		    = POINTER_TYPE_P (TREE_TYPE (op))
+		      ? POINTER_PLUS_EXPR : PLUS_EXPR;
+		  tree type = POINTER_TYPE_P (TREE_TYPE (op))
+			      ? sizetype : TREE_TYPE (op);
+		  double_int cst
+		    = double_int::from_shwi (arginfo[i].linear_step);
+		  cst *= double_int::from_uhwi (ncopies * nunits);
+		  tree tcst = double_int_to_tree (type, cst);
+		  tree phi_arg = copy_ssa_name (op, NULL);
+		  new_stmt = gimple_build_assign_with_ops (code, phi_arg,
+							   phi_res, tcst);
+		  gimple_stmt_iterator si = gsi_after_labels (loop->header);
+		  gsi_insert_after (&si, new_stmt, GSI_NEW_STMT);
+		  set_vinfo_for_stmt (new_stmt,
+				      new_stmt_vec_info (new_stmt, loop_vinfo,
+							 NULL));
+		  add_phi_arg (new_phi, phi_arg, loop_latch_edge (loop),
+			       UNKNOWN_LOCATION);
+		  arginfo[i].op = phi_res;
+		  vargs.quick_push (phi_res);
+		}
+	      else
+		{
+		  enum tree_code code
+		    = POINTER_TYPE_P (TREE_TYPE (op))
+		      ? POINTER_PLUS_EXPR : PLUS_EXPR;
+		  tree type = POINTER_TYPE_P (TREE_TYPE (op))
+			      ? sizetype : TREE_TYPE (op);
+		  double_int cst
+		    = double_int::from_shwi (arginfo[i].linear_step);
+		  cst *= double_int::from_uhwi (j * nunits);
+		  tree tcst = double_int_to_tree (type, cst);
+		  new_temp = make_ssa_name (TREE_TYPE (op), NULL);
+		  new_stmt
+		    = gimple_build_assign_with_ops (code, new_temp,
+						    arginfo[i].op, tcst);
+		  vect_finish_stmt_generation (stmt, new_stmt, gsi);
+		  vargs.quick_push (new_temp);
+		}
+	      break;
+	    case SIMD_CLONE_ARG_TYPE_LINEAR_VARIABLE_STEP:
+	    default:
+	      gcc_unreachable ();
+	    }
+	}
+
+      new_stmt = gimple_build_call_vec (fndecl, vargs);
+      if (vec_dest)
+	{
+	  gcc_assert (TYPE_VECTOR_SUBPARTS (rtype) == nunits);
+	  if (TYPE_VECTOR_SUBPARTS (vectype) == TYPE_VECTOR_SUBPARTS (rtype))
+	    new_temp = make_ssa_name (vec_dest, new_stmt);
+	  else
+	    new_temp = make_ssa_name (rtype, new_stmt);
+	  gimple_call_set_lhs (new_stmt, new_temp);
+	}
+      vect_finish_stmt_generation (stmt, new_stmt, gsi);
+
+      if (vec_dest)
+	{
+	  if (TYPE_VECTOR_SUBPARTS (vectype) < TYPE_VECTOR_SUBPARTS (rtype))
+	    {
+	      unsigned int k, l;
+	      unsigned int prec = GET_MODE_BITSIZE (TYPE_MODE (vectype));
+	      k = (TYPE_VECTOR_SUBPARTS (rtype)
+		   / TYPE_VECTOR_SUBPARTS (vectype));
+	      gcc_assert ((k & (k - 1)) == 0);
+	      for (l = 0; l < k; l++)
+		{
+		  tree t = build3 (BIT_FIELD_REF, vectype, new_temp,
+				   build_int_cst (integer_type_node, prec),
+				   build_int_cst (integer_type_node,
+						  l * prec));
+		  new_stmt
+		    = gimple_build_assign_with_ops (BIT_FIELD_REF,
+						    make_ssa_name (vectype,
+								   NULL),
+						    t, NULL_TREE);
+		  vect_finish_stmt_generation (stmt, new_stmt, gsi);
+		  if (j == 0 && l == 0)
+		    STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
+		  else
+		    STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
+
+		  prev_stmt_info = vinfo_for_stmt (new_stmt);
+		}
+	      continue;
+	    }
+	  else if (TYPE_VECTOR_SUBPARTS (vectype)
+		   > TYPE_VECTOR_SUBPARTS (rtype))
+	    {
+	      unsigned int k = (TYPE_VECTOR_SUBPARTS (vectype)
+				/ TYPE_VECTOR_SUBPARTS (rtype));
+	      gcc_assert ((k & (k - 1)) == 0);
+	      if ((j & (k - 1)) == 0)
+		vec_alloc (ret_ctor_elts, k);
+	      CONSTRUCTOR_APPEND_ELT (ret_ctor_elts, NULL_TREE, new_temp);
+	      if ((j & (k - 1)) != k - 1)
+		continue;
+	      vec_oprnd0 = build_constructor (vectype, ret_ctor_elts);
+	      new_stmt
+		= gimple_build_assign_with_ops (CONSTRUCTOR,
+						make_ssa_name (vec_dest, NULL),
+						vec_oprnd0, NULL_TREE);
+	      vect_finish_stmt_generation (stmt, new_stmt, gsi);
+
+	      if ((unsigned) j == k - 1)
+		STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
+	      else
+		STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
+
+	      prev_stmt_info = vinfo_for_stmt (new_stmt);
+	      continue;
+	    }
+	}
+
+      if (j == 0)
+	STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
+      else
+	STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
+
+      prev_stmt_info = vinfo_for_stmt (new_stmt);
+    }
+
+  vargs.release ();
+
+  /* Update the exception handling table with the vector stmt if necessary.  */
+  if (maybe_clean_or_replace_eh_stmt (stmt, *vec_stmt))
+    gimple_purge_dead_eh_edges (gimple_bb (stmt));
+
+  /* The call in STMT might prevent it from being removed in dce.
+     We however cannot remove it here, due to the way the ssa name
+     it defines is mapped to the new definition.  So just replace
+     rhs of the statement with something harmless.  */
+
+  if (slp_node)
+    return true;
+
+  if (scalar_dest)
+    {
+      type = TREE_TYPE (scalar_dest);
+      if (is_pattern_stmt_p (stmt_info))
+	lhs = gimple_call_lhs (STMT_VINFO_RELATED_STMT (stmt_info));
+      else
+	lhs = gimple_call_lhs (stmt);
+      new_stmt = gimple_build_assign (lhs, build_zero_cst (type));
+    }
+  else
+    new_stmt = gimple_build_nop ();
+  set_vinfo_for_stmt (new_stmt, stmt_info);
+  set_vinfo_for_stmt (stmt, NULL);
+  STMT_VINFO_STMT (stmt_info) = new_stmt;
+  gsi_replace (gsi, new_stmt, false);
+
+  return true;
+}
+
+
 /* Function vect_gen_widened_results_half
 
    Create a vector stmt whose code, type, number of arguments, and result
@@ -5869,6 +6349,7 @@  vect_analyze_stmt (gimple stmt, bool *ne
             || vectorizable_assignment (stmt, NULL, NULL, NULL)
             || vectorizable_load (stmt, NULL, NULL, NULL, NULL)
 	    || vectorizable_call (stmt, NULL, NULL, NULL)
+	    || vectorizable_simd_clone_call (stmt, NULL, NULL, NULL)
             || vectorizable_store (stmt, NULL, NULL, NULL)
             || vectorizable_reduction (stmt, NULL, NULL, NULL)
             || vectorizable_condition (stmt, NULL, NULL, NULL, 0, NULL));
@@ -5881,6 +6362,7 @@  vect_analyze_stmt (gimple stmt, bool *ne
                 || vectorizable_assignment (stmt, NULL, NULL, node)
                 || vectorizable_load (stmt, NULL, NULL, node, NULL)
 		|| vectorizable_call (stmt, NULL, NULL, node)
+		|| vectorizable_simd_clone_call (stmt, NULL, NULL, node)
                 || vectorizable_store (stmt, NULL, NULL, node)
                 || vectorizable_condition (stmt, NULL, NULL, NULL, 0, node));
       }
@@ -6003,6 +6485,11 @@  vect_transform_stmt (gimple stmt, gimple
       stmt = gsi_stmt (*gsi);
       break;
 
+    case call_simd_clone_vec_info_type:
+      done = vectorizable_simd_clone_call (stmt, gsi, &vec_stmt, slp_node);
+      stmt = gsi_stmt (*gsi);
+      break;
+
     case reduc_vec_info_type:
       done = vectorizable_reduction (stmt, gsi, &vec_stmt, slp_node);
       gcc_assert (done);