diff mbox series

remove SLP_TREE_TWO_OPERATORS, add SLP permutation node

Message ID nycvar.YFH.7.76.2006171641120.4397@zhemvz.fhfr.qr
State New
Headers show
Series remove SLP_TREE_TWO_OPERATORS, add SLP permutation node | expand

Commit Message

Richard Biener June 17, 2020, 2:45 p.m. UTC
This removes the SLP_TREE_TWO_OPERATORS hack in favor of having
explicit SLP nodes for both computations and the blend operation.
For this introduce a generic merge + select + permute SLP node
(with implementation limits).

Building upon earlier patches it adds vect_stmt_dominates_stmt_p
and the ability to compute a vector insertion place from
vectorized stmts (which now have UID zero) as needed for
the permute node.

Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.

When generalizing this to fully replace vect_transform_slp_perm_load
I need help with SVE and variable-size vectors.  I'll send the
(incomplete) followup to do this tomorrow (hopefully) together
with some larger outline and open questions.  For now this only
performs blends for the former SLP_TREE_TWO_OPERATORS - not sure
of SVE handled all cases here, the following does not.  I'm still
planning to install it and fix up afterwards.

Thanks,
Richard.

2020-06-17  Richard Biener  <rguenther@suse.de>

	* tree-vectorizer.h (_slp_tree::two_operators): Remove.
	(_slp_tree::lane_permutation): New member.
	(_slp_tree::code): Likewise.
	(SLP_TREE_TWO_OPERATORS): Remove.
	(SLP_TREE_LANE_PERMUTATION): New.
	(SLP_TREE_CODE): Likewise.
	(vect_stmt_dominates_stmt_p): Declare.
	* tree-vectorizer.c (vect_stmt_dominates_stmt_p): New function.
	* tree-vect-stmts.c (vect_model_simple_cost): Remove
	SLP_TREE_TWO_OPERATORS handling.
	* tree-vect-slp.c (_slp_tree::_slp_tree): Amend.
	(_slp_tree::~_slp_tree): Likewise.
	(vect_two_operations_perm_ok_p): Remove.
	(vect_build_slp_tree_1): Remove verification of two-operator
	permutation here.
	(vect_build_slp_tree_2): When we have two different operators
	build two computation SLP nodes and a blend.
	(vect_print_slp_tree): Print the lane permutation if it exists.
	(slp_copy_subtree): Copy it.
	(vect_slp_rearrange_stmts): Re-arrange it.
	(vect_slp_analyze_node_operations_1): Handle SLP_TREE_CODE
	VEC_PERM_EXPR explicitely.
	(vect_schedule_slp_instance): Likewise.  Remove old
	SLP_TREE_TWO_OPERATORS code.
	(vectorizable_slp_permutation): New function.
---
 gcc/tree-vect-slp.c   | 442 +++++++++++++++++++++++++++++-------------
 gcc/tree-vect-stmts.c |   8 -
 gcc/tree-vectorizer.c |  57 ++++++
 gcc/tree-vectorizer.h |  13 +-
 4 files changed, 378 insertions(+), 142 deletions(-)
diff mbox series

Patch

diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index fe946738a97..e33b42fbc68 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -47,6 +47,9 @@  along with GCC; see the file COPYING3.  If not see
 #include "dump-context.h"
 
 
+static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
+					  slp_tree, stmt_vector_for_cost *);
+
 /* Initialize a SLP node.  */
 
 _slp_tree::_slp_tree ()
@@ -58,8 +61,9 @@  _slp_tree::_slp_tree ()
   SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
   SLP_TREE_CHILDREN (this) = vNULL;
   SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
-  SLP_TREE_TWO_OPERATORS (this) = false;
+  SLP_TREE_LANE_PERMUTATION (this) = vNULL;
   SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
+  SLP_TREE_CODE (this) = ERROR_MARK;
   SLP_TREE_VECTYPE (this) = NULL_TREE;
   SLP_TREE_REPRESENTATIVE (this) = NULL;
   this->refcnt = 1;
@@ -77,6 +81,7 @@  _slp_tree::~_slp_tree ()
   SLP_TREE_VEC_STMTS (this).release ();
   SLP_TREE_VEC_DEFS (this).release ();
   SLP_TREE_LOAD_PERMUTATION (this).release ();
+  SLP_TREE_LANE_PERMUTATION (this).release ();
 }
 
 /* Recursively free the memory allocated for the SLP tree rooted at NODE.
@@ -697,35 +702,6 @@  vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
   return true;
 }
 
-/* STMTS is a group of GROUP_SIZE SLP statements in which some
-   statements do the same operation as the first statement and in which
-   the others do ALT_STMT_CODE.  Return true if we can take one vector
-   of the first operation and one vector of the second and permute them
-   to get the required result.  VECTYPE is the type of the vector that
-   would be permuted.  */
-
-static bool
-vect_two_operations_perm_ok_p (vec<stmt_vec_info> stmts,
-			       unsigned int group_size, tree vectype,
-			       tree_code alt_stmt_code)
-{
-  unsigned HOST_WIDE_INT count;
-  if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&count))
-    return false;
-
-  vec_perm_builder sel (count, count, 1);
-  for (unsigned int i = 0; i < count; ++i)
-    {
-      unsigned int elt = i;
-      gassign *stmt = as_a <gassign *> (stmts[i % group_size]->stmt);
-      if (gimple_assign_rhs_code (stmt) == alt_stmt_code)
-	elt += count;
-      sel.quick_push (elt);
-    }
-  vec_perm_indices indices (sel, 2, count);
-  return can_vec_perm_const_p (TYPE_MODE (vectype), indices);
-}
-
 /* Verify if the scalar stmts STMTS are isomorphic, require data
    permutation or are of unsupported types of operation.  Return
    true if they are, otherwise return false and indicate in *MATCHES
@@ -1089,24 +1065,6 @@  vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
   if (alt_stmt_code != ERROR_MARK
       && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
     {
-      if (!vect_two_operations_perm_ok_p (stmts, group_size,
-					  vectype, alt_stmt_code))
-	{
-	  for (i = 0; i < group_size; ++i)
-	    if (gimple_assign_rhs_code (stmts[i]->stmt) == alt_stmt_code)
-	      {
-		matches[i] = false;
-		if (dump_enabled_p ())
-		  {
-		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-				     "Build SLP failed: different operation "
-				     "in stmt %G", stmts[i]->stmt);
-		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-				     "original stmt %G", first_stmt_info->stmt);
-		  }
-	      }
-	  return false;
-	}
       *two_operators = true;
     }
 
@@ -1512,8 +1470,59 @@  fail:
   *tree_size += this_tree_size + 1;
   *max_nunits = this_max_nunits;
 
+  if (two_operators)
+    {
+      /* ???  We'd likely want to either cache in bst_map sth like
+	 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
+	 the true { a+b, a+b, a+b, a+b } ... but there we don't have
+	 explicit stmts to put in so the keying on 'stmts' doesn't
+	 work (but we have the same issue with nodes that use 'ops').  */
+      slp_tree one = new _slp_tree;
+      slp_tree two = new _slp_tree;
+      SLP_TREE_DEF_TYPE (one) = vect_internal_def;
+      SLP_TREE_DEF_TYPE (two) = vect_internal_def;
+      SLP_TREE_VECTYPE (one) = vectype;
+      SLP_TREE_VECTYPE (two) = vectype;
+      SLP_TREE_CHILDREN (one).safe_splice (children);
+      SLP_TREE_CHILDREN (two).safe_splice (children);
+      slp_tree child;
+      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
+	child->refcnt++;
+
+      /* Here we record the original defs since this
+	 node represents the final lane configuration.  */
+      node = vect_create_new_slp_node (stmts, 2);
+      SLP_TREE_VECTYPE (node) = vectype;
+      SLP_TREE_CODE (node) = VEC_PERM_EXPR;
+      SLP_TREE_CHILDREN (node).quick_push (one);
+      SLP_TREE_CHILDREN (node).quick_push (two);
+      gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
+      enum tree_code code0 = gimple_assign_rhs_code (stmt);
+      enum tree_code ocode = ERROR_MARK;
+      stmt_vec_info ostmt_info;
+      unsigned j = 0;
+      FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
+	{
+	  gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
+	  if (gimple_assign_rhs_code (ostmt) != code0)
+	    {
+	      SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
+	      ocode = gimple_assign_rhs_code (ostmt);
+	      j = i;
+	    }
+	  else
+	    SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
+	}
+      SLP_TREE_CODE (one) = code0;
+      SLP_TREE_CODE (two) = ocode;
+      SLP_TREE_LANES (one) = stmts.length ();
+      SLP_TREE_LANES (two) = stmts.length ();
+      SLP_TREE_REPRESENTATIVE (one) = stmts[0];
+      SLP_TREE_REPRESENTATIVE (two) = stmts[j];
+      return node;
+    }
+
   node = vect_create_new_slp_node (stmts, nops);
-  SLP_TREE_TWO_OPERATORS (node) = two_operators;
   SLP_TREE_VECTYPE (node) = vectype;
   SLP_TREE_CHILDREN (node).splice (children);
   return node;
@@ -1557,6 +1566,15 @@  vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
 	dump_printf (dump_kind, " %u", j);
       dump_printf (dump_kind, " }\n");
     }
+  if (SLP_TREE_LANE_PERMUTATION (node).exists ())
+    {
+      dump_printf_loc (metadata, user_loc, "\tlane permutation {");
+      for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
+	dump_printf (dump_kind, " %u[%u]",
+		     SLP_TREE_LANE_PERMUTATION (node)[i].first,
+		     SLP_TREE_LANE_PERMUTATION (node)[i].second);
+      dump_printf (dump_kind, " }\n");
+    }
   if (SLP_TREE_CHILDREN (node).is_empty ())
     return;
   dump_printf_loc (metadata, user_loc, "\tchildren");
@@ -1693,6 +1711,8 @@  slp_copy_subtree (slp_tree node, hash_map<slp_tree, slp_tree> &map)
     SLP_TREE_SCALAR_OPS (copy) = SLP_TREE_SCALAR_OPS (node).copy ();
   if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
     SLP_TREE_LOAD_PERMUTATION (copy) = SLP_TREE_LOAD_PERMUTATION (node).copy ();
+  if (SLP_TREE_LANE_PERMUTATION (node).exists ())
+    SLP_TREE_LANE_PERMUTATION (copy) = SLP_TREE_LANE_PERMUTATION (node).copy ();
   if (SLP_TREE_CHILDREN (node).exists ())
     SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node).copy ();
   gcc_assert (!SLP_TREE_VEC_STMTS (node).exists ());
@@ -1746,6 +1766,13 @@  vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
       SLP_TREE_SCALAR_OPS (node).release ();
       SLP_TREE_SCALAR_OPS (node) = tmp_ops;
     }
+  if (SLP_TREE_LANE_PERMUTATION (node).exists ())
+    {
+      gcc_assert (group_size == SLP_TREE_LANE_PERMUTATION (node).length ());
+      for (i = 0; i < group_size; ++i)
+	SLP_TREE_LANE_PERMUTATION (node)[i].second
+	  = permutation[SLP_TREE_LANE_PERMUTATION (node)[i].second];
+    }
 }
 
 
@@ -2632,6 +2659,10 @@  vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
 	= vect_get_num_vectors (vf * group_size, vectype);
     }
 
+  /* Handle purely internal nodes.  */
+  if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
+    return vectorizable_slp_permutation (vinfo, NULL, node, cost_vec);
+
   bool dummy;
   return vect_analyze_stmt (vinfo, stmt_info, &dummy,
 			    node, node_instance, cost_vec);
@@ -3933,6 +3964,197 @@  vect_transform_slp_perm_load (vec_info *vinfo,
   return true;
 }
 
+
+/* Vectorize the SLP permutations in NODE as specified
+   in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
+   child number and lane number.
+   Interleaving of two two-lane two-child SLP subtrees (not supported):
+     [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
+   A blend of two four-lane two-child SLP subtrees:
+     [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
+   Highpart of a four-lane one-child SLP subtree (not supported):
+     [ { 0, 2 }, { 0, 3 } ]
+   Where currently only a subset is supported by code generating below.  */
+
+static bool
+vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
+			      slp_tree node, stmt_vector_for_cost *cost_vec)
+{
+  tree vectype = SLP_TREE_VECTYPE (node);
+
+  /* ???  We currently only support all same vector input and output types
+     while the SLP IL should really do a concat + select and thus accept
+     arbitrary mismatches.  */
+  slp_tree child;
+  unsigned i;
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
+    if (!types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
+      {
+	if (dump_enabled_p ())
+	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			   "Unsupported lane permutation\n");
+	return false;
+      }
+
+  vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION (node);
+  gcc_assert (perm.length () == SLP_TREE_LANES (node));
+  if (dump_enabled_p ())
+    {
+      dump_printf_loc (MSG_NOTE, vect_location,
+		       "vectorizing permutation");
+      for (unsigned i = 0; i < perm.length (); ++i)
+	dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
+      dump_printf (MSG_NOTE, "\n");
+    }
+
+  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
+  if (!nunits.is_constant ())
+    return false;
+  unsigned HOST_WIDE_INT vf = 1;
+  if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
+    if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&vf))
+      return false;
+  unsigned olanes = vf * SLP_TREE_LANES (node);
+  gcc_assert (multiple_p (olanes, nunits));
+
+  /* Compute the { { SLP operand, vector index}, lane } permutation sequence
+     from the { SLP operand, scalar lane } permutation as recorded in the
+     SLP node as intermediate step.  This part should already work
+     with SLP children with arbitrary number of lanes.  */
+  auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
+  auto_vec<unsigned> active_lane;
+  vperm.create (olanes);
+  active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length ());
+  for (unsigned i = 0; i < vf; ++i)
+    {
+      for (unsigned pi = 0; pi < perm.length (); ++pi)
+	{
+	  std::pair<unsigned, unsigned> p = perm[pi];
+	  tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]);
+	  unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
+	  unsigned vi = (active_lane[p.first] + p.second) / vnunits;
+	  unsigned vl = (active_lane[p.first] + p.second) % vnunits;
+	  vperm.quick_push (std::make_pair (std::make_pair (p.first, vi), vl));
+	}
+      /* Advance to the next group.  */
+      for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j)
+	active_lane[j] += SLP_TREE_LANES (SLP_TREE_CHILDREN (node)[j]);
+    }
+
+  if (dump_enabled_p ())
+    {
+      dump_printf_loc (MSG_NOTE, vect_location, "as");
+      for (unsigned i = 0; i < vperm.length (); ++i)
+	{
+	  if (i != 0 && multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype)))
+	    dump_printf (MSG_NOTE, ",");
+	  dump_printf (MSG_NOTE, " vops%u[%u][%u]",
+		       vperm[i].first.first, vperm[i].first.second,
+		       vperm[i].first.second);
+	}
+      dump_printf (MSG_NOTE, "\n");
+    }
+
+  /* We can only handle two-vector permutes, everything else should
+     be lowered on the SLP level.  The following is closely inspired
+     by vect_transform_slp_perm_load and is supposed to eventually
+     replace it.
+     ???   As intermediate step do code-gen in the SLP tree representation
+     somehow?  */
+  std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
+  std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
+  unsigned int const_nunits = nunits.to_constant ();
+  unsigned int index = 0;
+  unsigned int mask_element;
+  vec_perm_builder mask;
+  mask.new_vector (const_nunits, const_nunits, 1);
+  unsigned int count = mask.encoded_nelts ();
+  mask.quick_grow (count);
+  vec_perm_indices indices;
+  unsigned nperms = 0;
+  for (unsigned i = 0; i < vperm.length (); ++i)
+    {
+      mask_element = vperm[i].second;
+      if (first_vec.first == -1U
+	  || first_vec == vperm[i].first)
+	first_vec = vperm[i].first;
+      else if (second_vec.first == -1U
+	       || second_vec == vperm[i].first)
+	{
+	  second_vec = vperm[i].first;
+	  mask_element += const_nunits;
+	}
+      else
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "permutation requires at "
+			     "least three vectors");
+	  gcc_assert (!gsi);
+	  return false;
+	}
+
+      mask[index++] = mask_element;
+
+      if (index == count)
+	{
+	  indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
+			      const_nunits);
+	  if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
+	    {
+	      if (dump_enabled_p ())
+		{
+		  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
+				   vect_location,
+				   "unsupported vect permute { ");
+		  for (i = 0; i < count; ++i)
+		    {
+		      dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
+		      dump_printf (MSG_MISSED_OPTIMIZATION, " ");
+		    }
+		  dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
+		}
+	      gcc_assert (!gsi);
+	      return false;
+	    }
+
+	  nperms++;
+	  if (gsi)
+	    {
+	      tree mask_vec = vect_gen_perm_mask_checked (vectype, indices);
+
+	      if (second_vec.first == -1U)
+		second_vec = first_vec;
+
+	      /* Generate the permute statement if necessary.  */
+	      slp_tree first_node = SLP_TREE_CHILDREN (node)[first_vec.first];
+	      tree first_def
+		= vect_get_slp_vect_def (first_node, first_vec.second);
+	      slp_tree second_node = SLP_TREE_CHILDREN (node)[second_vec.first];
+	      tree second_def
+		= vect_get_slp_vect_def (second_node, second_vec.second);
+	      tree perm_dest = make_ssa_name (vectype);
+	      gassign *perm_stmt
+		= gimple_build_assign (perm_dest, VEC_PERM_EXPR,
+				       first_def, second_def,
+				       mask_vec);
+	      vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
+	      /* Store the vector statement in NODE.  */
+	      SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
+	    }
+
+	  index = 0;
+	  first_vec = std::make_pair (-1U, -1U);
+	  second_vec = std::make_pair (-1U, -1U);
+	}
+    }
+
+  if (!gsi)
+    record_stmt_cost (cost_vec, nperms, vec_perm, NULL, vectype, 0, vect_body);
+
+  return true;
+}
+
 /* Vectorize SLP instance tree in postorder.  */
 
 static void
@@ -3967,16 +4189,10 @@  vect_schedule_slp_instance (vec_info *vinfo,
   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
     vect_schedule_slp_instance (vinfo, child, instance);
 
-  stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
-
-  /* VECTYPE is the type of the destination.  */
-  tree vectype = SLP_TREE_VECTYPE (node);
-  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
-  unsigned group_size = SLP_TREE_SCALAR_STMTS (node).length ();
-
   gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
   SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
 
+  stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
 		     "------>vectorizing SLP node starting from: %G",
@@ -3985,84 +4201,50 @@  vect_schedule_slp_instance (vec_info *vinfo,
   /* Vectorized stmts go before the last scalar stmt which is where
      all uses are ready.  */
   stmt_vec_info last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
-  si = gsi_for_stmt (last_stmt_info->stmt);
-
-  /* Handle two-operation SLP nodes by vectorizing the group with
-     both operations and then performing a merge.  */
-  bool done_p = false;
-  if (SLP_TREE_TWO_OPERATORS (node))
+  if (last_stmt_info)
+    si = gsi_for_stmt (last_stmt_info->stmt);
+  else
     {
-      gassign *stmt = as_a <gassign *> (stmt_info->stmt);
-      enum tree_code code0 = gimple_assign_rhs_code (stmt);
-      enum tree_code ocode = ERROR_MARK;
-      stmt_vec_info ostmt_info;
-      vec_perm_builder mask (group_size, group_size, 1);
-      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt_info)
-	{
-	  gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
-	  if (gimple_assign_rhs_code (ostmt) != code0)
-	    {
-	      mask.quick_push (1);
-	      ocode = gimple_assign_rhs_code (ostmt);
-	    }
-	  else
-	    mask.quick_push (0);
-	}
-      if (ocode != ERROR_MARK)
+      /* Or if we do not have 1:1 matching scalar stmts emit after the
+	 children vectorized defs.  */
+      gimple *last_in_child;
+      gimple *last_stmt = NULL;
+      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
+	/* ???  With only external defs the following breaks.  Note
+	   external defs do not get all vector init stmts generated
+	   at the same place.  */
+	if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
+	  {
+	    /* We are emitting all vectorized stmts in the same place and
+	       the last one is the last.  */
+	    last_in_child = SLP_TREE_VEC_STMTS (child).last ();
+	    if (!last_stmt
+		|| vect_stmt_dominates_stmt_p (last_stmt, last_in_child))
+	      last_stmt = last_in_child;
+	  }
+      if (is_a <gphi *> (last_stmt))
+	si = gsi_after_labels (gimple_bb (last_stmt));
+      else
 	{
-	  vec<gimple *> v0;
-	  vec<gimple *> v1;
-	  unsigned j;
-	  tree tmask = NULL_TREE;
-	  vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
-	  v0 = SLP_TREE_VEC_STMTS (node).copy ();
-	  SLP_TREE_VEC_STMTS (node).truncate (0);
-	  gimple_assign_set_rhs_code (stmt, ocode);
-	  vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
-	  gimple_assign_set_rhs_code (stmt, code0);
-	  v1 = SLP_TREE_VEC_STMTS (node).copy ();
-	  SLP_TREE_VEC_STMTS (node).truncate (0);
-	  tree meltype = build_nonstandard_integer_type
-	      (GET_MODE_BITSIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype))), 1);
-	  tree mvectype = get_same_sized_vectype (meltype, vectype);
-	  unsigned k = 0, l;
-	  for (j = 0; j < v0.length (); ++j)
-	    {
-	      /* Enforced by vect_build_slp_tree, which rejects variable-length
-		 vectors for SLP_TREE_TWO_OPERATORS.  */
-	      unsigned int const_nunits = nunits.to_constant ();
-	      tree_vector_builder melts (mvectype, const_nunits, 1);
-	      for (l = 0; l < const_nunits; ++l)
-		{
-		  if (k >= group_size)
-		    k = 0;
-		  tree t = build_int_cst (meltype,
-					  mask[k++] * const_nunits + l);
-		  melts.quick_push (t);
-		}
-	      tmask = melts.build ();
-
-	      /* ???  Not all targets support a VEC_PERM_EXPR with a
-	         constant mask that would translate to a vec_merge RTX
-		 (with their vec_perm_const_ok).  We can either not
-		 vectorize in that case or let veclower do its job.
-		 Unfortunately that isn't too great and at least for
-		 plus/minus we'd eventually like to match targets
-		 vector addsub instructions.  */
-	      gimple *vstmt;
-	      vstmt = gimple_build_assign (make_ssa_name (vectype),
-					   VEC_PERM_EXPR,
-					   gimple_assign_lhs (v0[j]),
-					   gimple_assign_lhs (v1[j]),
-					   tmask);
-	      vect_finish_stmt_generation (vinfo, stmt_info, vstmt, &si);
-	      SLP_TREE_VEC_STMTS (node).quick_push (vstmt);
-	    }
-	  v0.release ();
-	  v1.release ();
-	  done_p = true;
+	  si = gsi_for_stmt (last_stmt);
+	  gsi_next (&si);
 	}
     }
+
+  bool done_p = false;
+
+  /* Handle purely internal nodes.  */
+  if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
+    {
+      /* ???  the transform kind is stored to STMT_VINFO_TYPE which might
+	 be shared with different SLP nodes (but usually it's the same
+	 operation apart from the case the stmt is only there for denoting
+	 the actual scalar lane defs ...).  So do not call vect_transform_stmt
+	 but open-code it here (partly).  */
+      bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
+      gcc_assert (done);
+      done_p = true;
+    }
   if (!done_p)
     vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
 }
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 4a0a907fcb4..b134b39c07c 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -820,14 +820,6 @@  vect_model_simple_cost (vec_info *,
 	prologue_cost += record_stmt_cost (cost_vec, 1, scalar_to_vec,
 					   stmt_info, 0, vect_prologue);
 
-  /* Adjust for two-operator SLP nodes.  */
-  if (node && SLP_TREE_TWO_OPERATORS (node))
-    {
-      ncopies *= 2;
-      inside_cost += record_stmt_cost (cost_vec, ncopies, vec_perm,
-				       stmt_info, 0, vect_body);
-    }
-
   /* Pass the inside-of-loop statements to the target-specific cost model.  */
   inside_cost += record_stmt_cost (cost_vec, ncopies, kind,
 				   stmt_info, 0, vect_body);
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index 76cfba5d497..e262ba0580e 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -711,6 +711,63 @@  vec_info::free_stmt_vec_info (stmt_vec_info stmt_info)
   free (stmt_info);
 }
 
+/* Returns true if S1 dominates S2.  */
+
+bool
+vect_stmt_dominates_stmt_p (gimple *s1, gimple *s2)
+{
+  basic_block bb1 = gimple_bb (s1), bb2 = gimple_bb (s2);
+
+  /* If bb1 is NULL, it should be a GIMPLE_NOP def stmt of an (D)
+     SSA_NAME.  Assume it lives at the beginning of function and
+     thus dominates everything.  */
+  if (!bb1 || s1 == s2)
+    return true;
+
+  /* If bb2 is NULL, it doesn't dominate any stmt with a bb.  */
+  if (!bb2)
+    return false;
+
+  if (bb1 != bb2)
+    return dominated_by_p (CDI_DOMINATORS, bb2, bb1);
+
+  /* PHIs in the same basic block are assumed to be
+     executed all in parallel, if only one stmt is a PHI,
+     it dominates the other stmt in the same basic block.  */
+  if (gimple_code (s1) == GIMPLE_PHI)
+    return true;
+
+  if (gimple_code (s2) == GIMPLE_PHI)
+    return false;
+
+  /* Inserted vectorized stmts all have UID 0 while the original stmts
+     in the IL have UID increasing within a BB.  Walk from both sides
+     until we find the other stmt or a stmt with UID != 0.  */
+  gimple_stmt_iterator gsi1 = gsi_for_stmt (s1);
+  while (gimple_uid (gsi_stmt (gsi1)) == 0)
+    {
+      gsi_next (&gsi1);
+      if (gsi_end_p (gsi1))
+	return false;
+      if (gsi_stmt (gsi1) == s2)
+	return true;
+    }
+
+  gimple_stmt_iterator gsi2 = gsi_for_stmt (s2);
+  while (gimple_uid (gsi_stmt (gsi2)) == 0)
+    {
+      gsi_prev (&gsi2);
+      if (gsi_end_p (gsi2))
+	return false;
+      if (gsi_stmt (gsi2) == s1)
+	return true;
+    }
+
+  if (gimple_uid (gsi_stmt (gsi1)) <= gimple_uid (gsi_stmt (gsi2)))
+    return true;
+  return false;
+}
+
 /* A helper function to free scev and LOOP niter information, as well as
    clear loop constraint LOOP_C_FINITE.  */
 
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 6c830ad09f4..a2009913d80 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -135,6 +135,10 @@  struct _slp_tree {
   /* Load permutation relative to the stores, NULL if there is no
      permutation.  */
   vec<unsigned> load_permutation;
+  /* Lane permutation of the operands scalar lanes encoded as pairs
+     of { operand number, lane number }.  The number of elements
+     denotes the number of output lanes.  */
+  vec<std::pair<unsigned, unsigned> > lane_permutation;
 
   tree vectype;
   /* Vectorized stmt/s.  */
@@ -151,12 +155,12 @@  struct _slp_tree {
   /* The maximum number of vector elements for the subtree rooted
      at this node.  */
   poly_uint64 max_nunits;
-  /* Whether the scalar computations use two different operators.  */
-  bool two_operators;
   /* The DEF type of this node.  */
   enum vect_def_type def_type;
   /* The number of scalar lanes produced by this node.  */
   unsigned int lanes;
+  /* The operation of this node.  */
+  enum tree_code code;
 };
 
 
@@ -195,11 +199,12 @@  public:
 #define SLP_TREE_VEC_DEFS(S)                     (S)->vec_defs
 #define SLP_TREE_NUMBER_OF_VEC_STMTS(S)          (S)->vec_stmts_size
 #define SLP_TREE_LOAD_PERMUTATION(S)             (S)->load_permutation
-#define SLP_TREE_TWO_OPERATORS(S)		 (S)->two_operators
+#define SLP_TREE_LANE_PERMUTATION(S)             (S)->lane_permutation
 #define SLP_TREE_DEF_TYPE(S)			 (S)->def_type
 #define SLP_TREE_VECTYPE(S)			 (S)->vectype
 #define SLP_TREE_REPRESENTATIVE(S)		 (S)->representative
 #define SLP_TREE_LANES(S)			 (S)->lanes
+#define SLP_TREE_CODE(S)			 (S)->code
 
 /* Key for map that records association between
    scalar conditions and corresponding loop mask, and
@@ -1932,6 +1937,6 @@  void vect_pattern_recog (vec_info *);
 unsigned vectorize_loops (void);
 void vect_free_loop_info_assumptions (class loop *);
 gimple *vect_loop_vectorized_call (class loop *, gcond **cond = NULL);
-
+bool vect_stmt_dominates_stmt_p (gimple *, gimple *);
 
 #endif  /* GCC_TREE_VECTORIZER_H  */