diff mbox

[unified-autovect:,2/N] Implementation of k-arity promotion/reduction

Message ID 38C8F1E431EDD94A82971C543A11B4FEA5364A4B@PUMAIL01.pu.imgtec.org
State New
Headers show

Commit Message

sameera Feb. 6, 2017, 7:42 a.m. UTC
Hi Richard,

Sorry for delayed patch submission. I was on maternity leave, so could not post earlier.
Here is the previous mail for your reference: https://gcc.gnu.org/ml/gcc/2016-06/msg00043.html

Please find attached the patch for stage 2: implementation of k-arity promotion/reduction in the series "Improving effectiveness and generality of autovectorization using unified representation".

The permute nodes within primitive reorder tree(PRT) generated from input program can have any arity depending upon stride of accesses. However, the target cannot have instructions to support all arities. Hence, we need to promote or reduce the arity of PRT to enable successful tree tiling.

In classic autovectorization, if vectorization stride > 2, arity reduction is performed by generating cascaded extract and interleave instructions as described by "Auto-vectorization of Interleaved Data for SIMD" by D. Nuzman, I. Rosen and A. Zaks.  

Moreover, to enable SLP across loop, "Loop-aware SLP in GCC" by D. Nuzman, I. Rosen and A. Zaks unrolls loop till stride = vector size.

k-arity reduction/promotion algorithm makes use of modulo arithmetic to generate PRT of desired arity for both above-mentioned cases.

Single ILV node of arity k can be reduced into cascaded ILV nodes with single node of arity m with children of arity k/m such that ith child of original ILV node becomes floor (i/m) th child of (i%m) th child of new parent.

Single EXTR node with k parts and i selector can be reduced into cascaded EXTR nodes such that parent EXTR node has m parts and i/(k/m) selection on child EXTR node with k/m parts and i % (k/m) selection.

Similarly, loop unrolling to get desired arity m can be represented as arity promotion from k to m.

Single ILV node of arity k can be promoted to single ILV node of arity m by adding extraction with m/k parts and selection i/k of i%k the child of original tree as ith child of new ILV node.

To enable loop-aware SLP, we first promote arity of input PRT to maximum vector size permissible on the architecture. This can have impact on vector code size, though performance will be the same. However, to allow variable vector size like SVE in NEON, it is necessary.

Later we apply arity promotion reduction algorithm on the output tree to get tree with desired arity. For now, we are supporting target arity = 2, as most of the architectures have support for that. However, the code can be extended for additional arity supports as well.

I have tested the code with handwritten testcases for correctness.
Do you spot any problem in the logic or arithmetic that I am performing for reduction/promotion? If not, will push this patch on the branch that we have created - unified-autovect.

- Thanks and regards,
  Sameera D.
diff mbox

Patch

Index: gcc/Makefile.in
===================================================================
--- gcc/Makefile.in	(revision 243687)
+++ gcc/Makefile.in	(working copy)
@@ -1529,6 +1529,7 @@ 
 	tree-vect-slp.o \
 	tree-vectorizer.o \
 	tree-vect-unified.o \
+	tree-vect-unified-opts.o \
 	tree-vrp.o \
 	tree.o \
 	valtrack.o \
Index: gcc/tree-vect-data-refs.c
===================================================================
--- gcc/tree-vect-data-refs.c	(revision 238158)
+++ gcc/tree-vect-data-refs.c	(working copy)
@@ -136,16 +136,9 @@ 
   return scalar_type;
 }
 
-
-/* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
-   tested at run-time.  Return TRUE if DDR was successfully inserted.
-   Return false if versioning is not supported.  */
-
-static bool
-vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
+bool
+vect_mark_for_runtime_alias_test_1 (ddr_p ddr, loop *loop)
 {
-  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
-
   if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
     return false;
 
@@ -189,11 +182,28 @@ 
       return false;
     }
 
-  LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
   return true;
 }
 
 
+
+/* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
+   tested at run-time.  Return TRUE if DDR was successfully inserted.
+   Return false if versioning is not supported.  */
+
+static bool
+vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
+{
+  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  bool is_alias;
+
+  is_alias = vect_mark_for_runtime_alias_test_1 (ddr, loop);
+  if (is_alias)
+    LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
+  return is_alias;
+}
+
+
 /* Function vect_analyze_data_ref_dependence.
 
    Return TRUE if there (might) exist a dependence between a memory-reference
Index: gcc/tree-vect-unified-opts.c
===================================================================
--- gcc/tree-vect-unified-opts.c	(revision 0)
+++ gcc/tree-vect-unified-opts.c	(working copy)
@@ -0,0 +1,391 @@ 
+/* lOOP Vectorization using unified representation
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+/* Loop autovectorization using unified representation for permute
+   instructions.  */
+#if 1
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "tree.h"
+#include "gimple.h"
+#include "predict.h"
+#include "tree-pass.h"
+#include "ssa.h"
+#include "cgraph.h"
+#include "fold-const.h"
+#include "stor-layout.h"
+#include "gimple-iterator.h"
+#include "gimple-walk.h"
+#include "tree-ssa-loop-manip.h"
+#include "tree-cfg.h"
+#include "cfgloop.h"
+#include "tree-vectorizer.h"
+#include "tree-ssa-propagate.h"
+#include "dbgcnt.h"
+#include "tree-scalar-evolution.h"
+#include "tree-vect-unified.h"
+#include "tree-pretty-print.h"
+#include "gimple-pretty-print.h"
+#include "target.h"
+#include "rtl.h"
+#include "tm_p.h"
+#include "optabs-tree.h"
+#include "dumpfile.h"
+#include "alias.h"
+#include "tree-eh.h"
+#include "gimplify.h"
+#include "gimplify-me.h"
+#include "tree-ssa-loop-ivopts.h"
+#include "tree-ssa-loop.h"
+#include "expr.h"
+#include "builtins.h"
+#include "params.h"
+#include "pretty-print.h"
+#include "pretty-print.h"
+
+/* Function ILV_arity_reduction.
+
+	ILV_k^N					    ILV_m^Nk/m
+      /    |    \        =>                         /   |    \
+     /     |     \                               /      |       \
+    T1    T2 ... Tk                          /          |   ...     \
+	                                 /              |              \
+	                             ILV_k/m^N      ILV_k/m^N      ILV_k/m^N
+	                              /  |  \        /  |  \        /  |  \
+	                             /   |...\      /   |...\      /   |...\
+	                           T1 Tm+1 Tk-m+1  T2 Tm+2 Tk-m+2 Tm  T2m  Tk
+
+*/
+
+struct primop_tree *
+ILV_arity_reduction (struct primop_tree *root, int from_arity, int to_arity)
+{
+  struct primop_tree *new_root, *new_child, *tmp;
+  tree new_iter_count;
+  int i, j;
+
+  gcc_assert (from_arity == PT_DIVISION (root));
+
+  new_iter_count = size_binop (MULT_EXPR,
+			fold_convert (ssizetype, PT_ITER_COUNT (root)),
+			ssize_int (from_arity / to_arity));
+  new_root = create_primTree_combine (POP_ILV, NULL, to_arity,
+	 new_iter_count, PT_PARENT (root));
+
+  for (i = 0; i < to_arity; i++)
+    {
+      new_child = create_primTree_combine (POP_ILV, NULL,
+	 from_arity / to_arity, PT_ITER_COUNT (root), new_root);
+
+      for (j = 0; j < from_arity / to_arity; j++)
+	{
+	  add_child_at_index (new_child, PT_CHILD (root, j * to_arity + i), j);
+	}
+
+      tmp = k_arity_promotion_reduction (new_child, to_arity);
+
+      if (tmp != NULL)
+	{
+	  add_child_at_index (new_root, tmp, i);
+	}
+      else
+	return NULL;
+    }
+
+  return new_root;
+}
+
+/*Function EXTR_arity_reduction.
+
+       EXTR Arity Reduction
+
+	               EXTR_k,i^N                     EXTR_m,(i/(k/m))%m^Nm/k
+	                    |          =>                      |
+	                    |                                  |
+	                    T                         EXTR_k/m,i%(k/m)^N
+	                                                       |
+	                                                       |
+	                                                       T
+*/
+
+struct primop_tree *
+EXTR_arity_reduction (struct primop_tree *root, int from_arity, int to_arity)
+{
+  struct primop_tree *new_root, *new_child, *tmp;
+  tree new_iter_count;
+  int i, j;
+  gcc_assert (from_arity == PT_DIVISION (root));
+  new_iter_count = size_binop (MULT_EXPR,
+			fold_convert (ssizetype, PT_ITER_COUNT (root)),
+			ssize_int (to_arity / from_arity));
+  new_root = create_primTree_partition (POP_EXTR, NULL, to_arity,
+		(PT_OPERAND_SELECTOR (root) * to_arity / from_arity) % to_arity,
+		new_iter_count, PT_PARENT (root));
+
+  new_child = create_primTree_partition (POP_EXTR, NULL, from_arity/to_arity,
+		PT_OPERAND_SELECTOR (root) % (from_arity / to_arity),
+		PT_ITER_COUNT (root), new_root);
+
+  add_child_at_index (new_child, PT_CHILD (root, 0), 0);
+
+  tmp = k_arity_promotion_reduction (new_child, to_arity);
+
+  if (tmp != NULL)
+    {
+      add_child_at_index (new_root, tmp, 0);
+    }
+  else
+    return NULL;
+
+
+  return new_root;
+}
+
+/* Function k_arity_reduction.
+
+   Driver function for arity reduction of EXTR, ILV, CONCAT and SPLT.  CONCAT
+   and SPLT need not be implemented due to phase ordering.  */
+
+struct primop_tree *
+k_arity_reduction (struct primop_tree *root, int from_arity, int to_arity)
+{
+  if (PT_NODE_OP (root) == POP_ILV)
+    return ILV_arity_reduction (root, from_arity, to_arity);
+
+  if (PT_NODE_OP (root) == POP_EXTR)
+    return EXTR_arity_reduction (root, from_arity, to_arity);
+
+  /* For leaf nodes - like const or memory, return root.  */
+  if (PT_ARITY (root) == 0)
+    return root;
+
+  return NULL;
+}
+
+/* Function ILV_arity_promotion.
+
+		ILV_k^N
+		 / | \
+		/  |  \
+	       /   |   \
+	      T1  T2   Tk
+
+	         _||_
+	 	 \  /
+	          \/
+
+	                   	  ILV_m^Nk/m
+	   +-----------------+-----------+-+---------------------+
+	  /                 /           /   \                     \
+       /                 /           /        \                     \
+    /                 /           /             \                     \
+EXTR_m/k^N,0 ... EXTR_m/k^N,0 EXTR_m/k^N,1 ... EXTR_m/k^N,m/k-1 ... EXTR_m/k^N,m/k-1
+     |                  |          |                   |                     |
+    T1                  Tk         T1                  T1                    Tk
+
+*/
+
+struct primop_tree *
+ILV_arity_promotion (struct primop_tree *root, int from_arity, int to_arity)
+{
+  struct primop_tree *new_root, *new_child, *tmp;
+  tree new_iter_count;
+  int i, j;
+
+  gcc_assert (from_arity == PT_DIVISION (root));
+  new_iter_count = size_binop (EXACT_DIV_EXPR, size_binop (MULT_EXPR,
+						 fold_convert (ssizetype,
+							 PT_ITER_COUNT (root)),
+						 ssize_int (from_arity)),
+			       ssize_int (to_arity));
+  new_root = create_primTree_combine (POP_ILV, NULL, to_arity,
+		new_iter_count, PT_PARENT (root));
+  for (i = 0; i < to_arity / from_arity; i++)
+    {
+      for (j = 0; j < from_arity; j++)
+	{
+	  new_child = create_primTree_partition (POP_EXTR, NULL,
+			 to_arity / from_arity, i, PT_ITER_COUNT (root),
+			 new_root);
+	  add_child_at_index (new_child, PT_CHILD (root, j), 0);
+	  tmp = k_arity_promotion_reduction (new_child, to_arity);
+	  if (tmp != NULL)
+	    add_child_at_index (new_root, tmp,  i * from_arity + j);
+	  else
+	    return NULL;
+	}
+    }
+
+  return new_root;
+}
+
+/* Function merge_EXTR_nodes.
+
+   Multiple EXTR nodes can be there in primop_tree, which need to be merged,
+   before promoting/reducing EXTR node for better optimization.  If merged EXTR
+   node needs further promotion, the vectorization cannot be applied, as EXTR
+   promotion can introduce new ILV node.
+  TODO - Add stricter check for final promotion/reduction to target specific
+  arity.  Currently, we do not block vectorization if EXTR arity cannot be
+  promoted.
+*/
+
+struct primop_tree *
+merge_EXTR_nodes (struct primop_tree *root, int to_arity)
+{
+  int from_arity, parts, selector;
+  struct primop_tree *iter_node, *tmp;
+  tree iter_count;
+
+  from_arity = PT_DIVISION (root);
+  parts = 1;
+  iter_node = root;
+  selector = 0;
+
+  while (PT_NODE_OP (iter_node) == POP_EXTR)
+    {
+      iter_count = PT_ITER_COUNT (iter_node);
+      parts = parts * PT_DIVISION (iter_node);
+      selector = selector * PT_DIVISION (iter_node)
+			    + PT_OPERAND_SELECTOR (iter_node);
+      iter_node = PT_CHILD (iter_node, 0);
+    }
+
+  if (iter_node != PT_CHILD (root, 0))
+    {
+      tmp = create_primTree_partition (POP_EXTR, NULL, parts, selector,
+			     	       iter_count, PT_PARENT (root));
+      add_child_at_index (tmp, iter_node, 0);
+      return tmp;
+    }
+
+  return root;
+}
+
+/* Function k_arity_promotion.
+
+   Driver function for arity promotion of EXTR, ILV, CONCAT and SPLT.  CONCAT
+   and SPLIT need not be implemented due to phase ordering.  Implementation of
+   ILV-promotion introduces new EXTR nodes whereas implementation of
+   EXTR-promotion introduce ILV node.  To avoid the infinite loop, only
+   ILV-promotion or EXTR-promotion can be implemented.  As the
+   k_arity_promotion_reduction is top-down algorithm, ILV-promotion is
+   implemented.  For EXTR-promotion, merge newly created EXPR node with child
+   EXTR node if any, and hope that arity promotion/reduction is applicable
+   to new EXTR node.  */
+
+struct primop_tree *
+k_arity_promotion (struct primop_tree *root, int from_arity, int to_arity)
+{
+  struct primop_tree *tmp_root;
+
+  if (PT_NODE_OP (root) == POP_ILV)
+    return ILV_arity_promotion (root, from_arity, to_arity);
+
+  if (PT_NODE_OP (root) == POP_EXTR)
+    {
+      tmp_root = merge_EXTR_nodes (root, to_arity);
+      if (tmp_root != root)
+	return k_arity_promotion_reduction (tmp_root, to_arity);
+      else
+	return root;
+    }
+/* For leaf nodes - like const or memory, return root.  */
+  if (PT_ARITY (root) == 0)
+    return root;
+
+  return NULL;
+}
+
+/* Function k_arity_promotion_reduction.
+
+   Driver function for promoting/reducing arity of the tree rooted at ROOT from
+   FROM_ARITY to TO_ARITY.  */
+
+struct primop_tree *
+k_arity_promotion_reduction (struct primop_tree *root, int to_arity)
+{
+  struct primop_tree *retval = root;
+  int from_arity, i;
+
+  if (dump_enabled_p ())
+    {
+      dump_printf_loc (MSG_NOTE, vect_location,
+			"\n k_arity_promotion_reduction: ");
+      dump_primtree_node (MSG_NOTE, root);
+    }
+
+  if (PT_NODE_OP (root) == POP_EXTR || PT_NODE_OP (root) == POP_SPLT)
+    from_arity = PT_DIVISION (root);
+  else
+    from_arity = PT_ARITY (root);
+
+  if (PT_NODE_OP (root) >= MAX_TREE_CODES && PT_NODE_OP (root) < POP_COLLAPSE)
+    {
+      if (from_arity > to_arity)
+	{
+	  /* Arity reduction.  */
+	  if (from_arity % to_arity == 0)
+    	    {
+    	      retval = k_arity_reduction (root, from_arity, to_arity);
+	      return retval;
+    	    }
+	  else
+    	    return NULL;
+	}
+      else if (from_arity < to_arity)
+	{
+	  /* Arity promotion.  */
+	  if (to_arity % from_arity == 0)
+    	    {
+    	      retval = k_arity_promotion (root, from_arity, to_arity);
+	      return retval;
+    	    }
+	  else
+    	    return NULL;
+	}
+      else
+	{
+	  retval = duplicate_prim_node (root);
+	  //return retval;
+	}
+    }
+
+  if (retval != NULL)
+    {
+      /* The tree node is compute-node.  Hence, no action to be taken for arity
+	 promotion/reduction.  However, the subtrees below this root may need
+	 arity adjustment.  Hence, invoke k_arity_promotion_reduction algorithm
+	 recursively on children of root.  */
+      for (i = 0; i < retval->children.length (); i++)
+	{
+	  struct primop_tree *tmp;
+	  tmp = k_arity_promotion_reduction (PT_CHILD (retval, i), to_arity);
+	  if (tmp == NULL)
+	    return NULL;
+
+	  PT_CHILD (retval, i) = tmp;
+	}
+
+      PT_ARITY (retval) = i;
+    }
+
+  return retval;
+
+
+}
+
+#endif
Index: gcc/tree-vect-unified.c
===================================================================
--- gcc/tree-vect-unified.c	(revision 243687)
+++ gcc/tree-vect-unified.c	(working copy)
@@ -1,4 +1,10 @@ 
-/* lOOP Vectorization using unified representation
+/* lOOP Vectorization using unified representation for permute instructions.
+   Copyright (C) 2003-2015 Free Software Foundation, Inc.
+   Contributed by Sameera Deshpande <sameera.deshpande@imgtec.com>
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free
 Software Foundation; either version 3, or (at your option) any later
 version.
@@ -102,6 +108,26 @@ 
 
 } // anon namespace
 
+#define DEFTREECODE(SYM, NAME, TYPE, LEN) NAME,
+#define END_OF_BASE_TREE_CODES "@dummy",
+
+static const char *const tree_code_name[] = {
+#include "all-tree.def"
+"ILV",
+"CONCAT",
+"EXTR",
+"SPLT",
+"COLLAPSE",
+"MEMREF",
+"CONST",
+"INVAR",
+"ITER"
+};
+
+#undef DEFTREECODE
+#undef END_OF_BASE_TREE_CODES
+
+
 gimple_opt_pass *
 make_pass_unified_vectorize (gcc::context *ctxt)
 {
@@ -108,6 +134,53 @@ 
   return new pass_unified_vectorize (ctxt);
 }
 
+
+vec<struct stmt_attr *> stmt_attr_vec;
+
+void
+init_stmt_attr_vec (void)
+{
+  gcc_assert (!stmt_attr_vec.exists ());
+  stmt_attr_vec.create (50);
+}
+
+void
+free_stmt_attr_vec (void)
+{
+  gcc_assert (stmt_attr_vec.exists ());
+  stmt_attr_vec.release ();
+}
+
+inline void
+set_stmt_attr (gimple *stmt, struct stmt_attr *info)
+{
+  unsigned int uid = gimple_uid (stmt);
+  if (uid == 0)
+    {
+      gcc_checking_assert (info);
+      uid = stmt_attr_vec.length () + 1;
+      gimple_set_uid (stmt, uid);
+      stmt_attr_vec.safe_push (info);
+    }
+  else
+    {
+      gcc_checking_assert (info == NULL);
+      stmt_attr_vec[uid - 1] = info;
+    }
+}
+
+inline struct stmt_attr *
+get_stmt_attr (gimple *stmt)
+{
+  unsigned int uid = gimple_uid (stmt);
+  if (uid == 0)
+    return NULL;
+
+  return stmt_attr_vec[uid - 1];
+}
+
+
+
 /* Function new_iter_node.
 
    Create new ITER_node for the loop LOOP, and return the pointer.  */
@@ -1498,7 +1571,7 @@ 
 */
 
 struct primop_tree *
-populate_prim_node (enum primop_code pcode, struct ITER_node *inode,
+populate_prim_node (enum primop_code pcode, tree iter_count,
 		    struct primop_tree *parent, gimple *stmt)
 {
   struct primop_tree *ptree;
@@ -1506,7 +1579,7 @@ 
 
   PT_NODE_OP (ptree) = (int) pcode;
   PT_PARENT (ptree) = parent;
-  PT_ITER_COUNT (ptree) = ITER_NODE_NITERS (inode);
+  PT_ITER_COUNT (ptree) = iter_count;
 
   if (stmt)
     {
@@ -1566,11 +1639,11 @@ 
 
 struct primop_tree *
 create_primTree_memref (tree base, tree step, bool is_read, int num,
-                        struct ITER_node *inode, struct primop_tree *parent)
+                        tree iter_count, struct primop_tree *parent)
 {
   struct primop_tree * ptree;
 
-  ptree = populate_prim_node (POP_MEMREF, inode, parent, NULL);
+  ptree = populate_prim_node (POP_MEMREF, iter_count, parent, NULL);
 
   PT_MEMVAL_BASE (ptree) = unshare_expr (base);
   PT_MEMVAL_MULT_IDX (ptree) = unshare_expr (step);
@@ -1579,8 +1652,8 @@ 
   if (dump_enabled_p ())
     {
       dump_printf_loc (MSG_NOTE, vect_location,
-		       " create_primTree_memref : stride - %d\n ",
-		       tree_to_uhwi (step) / num);
+		       " create_primTree_memref %d : stride - %d\n ",
+		       PT_PID (ptree), tree_to_uhwi (step) / num);
      }
   return ptree;
 }
@@ -1591,11 +1664,11 @@ 
    which primtree is being created.  */
 struct primop_tree *
 create_primTree_combine (enum primop_code pcode, gimple *stmt, int parts,
-			 struct ITER_node *inode, struct primop_tree *parent)
+			 tree iter_count, struct primop_tree *parent)
 {
   struct primop_tree * ptree;
 
-  ptree = populate_prim_node (pcode, inode, parent, stmt);
+  ptree = populate_prim_node (pcode, iter_count, parent, stmt);
   PT_OPERAND_SELECTOR (ptree) = -1;
   PT_DIVISION (ptree) = parts;
   PT_VAR_STRIDE (ptree) = NULL;
@@ -1602,7 +1675,8 @@ 
   if (dump_enabled_p ())
     {
       dump_printf_loc (MSG_NOTE, vect_location,
-		       " create_primTree_combine: parts - %d\n", parts);
+		       " create_primTree_combine %d : parts - %d\n",
+			PT_PID (ptree), parts);
     }
 
   return ptree;
@@ -1615,12 +1689,12 @@ 
    SELECTOR is the part being selected.  */
 struct primop_tree *
 create_primTree_partition (enum primop_code pcode, gimple *stmt, int parts,
-			   int selector, struct ITER_node *inode,
+			   int selector, tree iter_count,
 			   struct primop_tree *parent)
 {
   struct primop_tree * ptree;
 
-  ptree = populate_prim_node (pcode, inode, parent, stmt);
+  ptree = populate_prim_node (pcode, iter_count, parent, stmt);
   PT_OPERAND_SELECTOR (ptree) = selector;
   PT_DIVISION (ptree) = parts;
   PT_VAR_STRIDE (ptree) = NULL;
@@ -1628,8 +1702,8 @@ 
   if (dump_enabled_p ())
     {
       dump_printf_loc (MSG_NOTE, vect_location,
-		   " create_primTree_partition : parts - %d selector - %d\n",
-		   parts, selector);
+		   " create_primTree_partition %d : parts - %d selector - %d\n",
+		   PT_PID (ptree), parts, selector);
     }
 
   return ptree;
@@ -1660,7 +1734,46 @@ 
   return PT_CHILD (ptree, idx);
 }
 
+/* Function duplicate_prim_node.
 
+   This function copies contents of SRC node into new node, and returns pointer
+   to it.
+*/
+
+struct primop_tree *
+duplicate_prim_node (struct primop_tree *src)
+{
+  struct primop_tree *ptree;
+  int i;
+
+  ptree = init_primop_node ();
+
+  PT_NODE_OP (ptree) = PT_NODE_OP (src);
+  PT_PARENT (ptree) = PT_PARENT (src);
+  PT_ITER_COUNT (ptree) = PT_ITER_COUNT (src);
+  PT_VEC_SIZE (ptree) = PT_VEC_SIZE (src);
+  PT_VEC_TYPE (ptree) = PT_VEC_TYPE (src);
+
+  PT_ATTR_NO (ptree) = PT_ATTR_NO (src);
+
+  for (i = 0; i < src->children.length (); i++)
+    {
+      add_child_at_index (ptree, PT_CHILD (src, i), i);
+    }
+
+  memcpy (&ptree->u, &src->u, sizeof (ptree->u));
+
+  if (dump_enabled_p ())
+    {
+      dump_printf_loc (MSG_NOTE, vect_location,
+		   " duplicate_prim_node : pid - %d node_op - %s\n",
+		   PT_PID (src), tree_code_name[PT_NODE_OP (ptree)]);
+    }
+
+  return ptree;
+}
+
+
 /* Function vectorizable_store.
 
    This function checks if STMT is STORE instruction and is vectorizable for
@@ -1808,10 +1921,11 @@ 
   pnode = exists_primTree_with_memref (base, step, false, inode);
   if (pnode == NULL)
     {
-      pnode = create_primTree_memref (base, step, false, num, inode, NULL);
-      ITER_NODE_LOOP_BODY (inode).safe_push (pnode);
+      pnode = create_primTree_memref (base, step, false, num, ITER_NODE_NITERS (inode), NULL);
+      ITER_NODE_LOOP_BODY (inode).safe_insert (ITER_NODE_LOOP_BODY (inode).length (),
+			pnode);
       pchild1 =  create_primTree_combine (POP_ILV, stmt,
-			tree_to_uhwi (step) / num, inode, pnode);
+			tree_to_uhwi (step) / num, ITER_NODE_NITERS (inode), pnode);
       add_child_at_index (pnode, pchild1, 0);
     }
   else
@@ -1922,8 +2036,8 @@ 
 
   pnode = create_primTree_partition (POP_EXTR, stmt,
 		tree_to_uhwi (step) / num,
-		tree_to_uhwi (offset) / num, inode, parent);
-  pchild1 = create_primTree_memref (base, step, true, num, inode, pnode);
+		tree_to_uhwi (offset) / num, ITER_NODE_NITERS (inode), parent);
+  pchild1 = create_primTree_memref (base, step, true, num, ITER_NODE_NITERS (inode), pnode);
   add_child_at_index (pnode, pchild1, 0);
   return pnode;
 }
@@ -2067,7 +2181,7 @@ 
 static bool
 create_ptree (struct ITER_node *inode)
 {
-  auto_vec<gimple *, 64> worklist;
+  vec<gimple *> worklist;
   bool is_ok;
 
   mark_probable_root_nodes (inode);
@@ -2230,6 +2344,54 @@ 
   return create_ptree (inode);
 }
 
+void
+print_primtree (FILE *fp, struct primop_tree *t)
+{
+  switch (PT_NODE_OP (t))
+    {
+      case POP_ILV:
+      case POP_CONCAT:
+	dump_printf (MSG_NOTE, "%s:%d_%d\n", tree_code_name[PT_NODE_OP (t)],
+			PT_PID (t), PT_DIVISION (t));
+	break;
+
+      case POP_EXTR:
+      case POP_SPLT:
+	dump_printf (MSG_NOTE, "%s:%d_%d,%d\n", tree_code_name[PT_NODE_OP (t)],
+			PT_PID (t), PT_DIVISION (t), PT_OPERAND_SELECTOR (t));
+	break;
+
+      case POP_MEMREF:
+	dump_printf (MSG_NOTE, "%s:%d\n", tree_code_name[PT_NODE_OP (t)],
+			PT_PID (t));
+	print_generic_expr (fp, PT_MEMVAL_BASE (t), TDF_SLIM);
+	break;
+
+      case POP_CONST:
+      case POP_INV:
+      case POP_COLLAPSE:
+    	break;
+
+      case POP_ITER:
+	break;
+
+      default:
+        dump_gimple_stmt (MSG_NOTE, TDF_SLIM, PT_COMPUTE_STMT (t), 0);
+	break;
+
+    }
+}
+
+void
+dump_primtree_node (int dump_kind, struct primop_tree *t)
+{
+    if (dump_file)
+      print_primtree (dump_file, t);
+
+    else if (alt_dump_file)
+      print_primtree (alt_dump_file, t);
+}
+
 /* Function pretty_print_gimple_vec.
 
    Function to pretty print all the gimple statements in LIST.  */
@@ -2250,25 +2412,6 @@ 
     }
 }
 
-#define DEFTREECODE(SYM, NAME, TYPE, LEN) NAME,
-#define END_OF_BASE_TREE_CODES "@dummy",
-
-static const char *const tree_code_name[] = {
-#include "all-tree.def"
-"ILV",
-"CONCAT",
-"EXTR",
-"SPLT",
-"COLLAPSE",
-"MEMREF",
-"CONST",
-"INVAR",
-"ITER"
-};
-
-#undef DEFTREECODE
-#undef END_OF_BASE_TREE_CODES
-
 /* Function pp_primop_tree.
 
    Function to pretty print primop_tree PTREE.  */
@@ -2276,6 +2419,7 @@ 
 void
 pp_primop_tree (pretty_printer *pp, struct primop_tree * ptree)
 {
+  int i;
   pp_newline_and_indent (pp, 0);
   pp_indent (pp);
   pp_printf (pp,"node [shape=record];");
@@ -2330,12 +2474,13 @@ 
 
       worklist = ptree->children.copy ();
 
-      while (worklist.length () > 0)
+      for (i = 0; i < worklist.length (); i++)
 	{
-	  struct primop_tree *child = worklist.pop ();
+	  struct primop_tree *child;
+	  worklist.iterate (i, &child);
  	  pp_newline_and_indent (pp, 0);
           pp_indent (pp);
-	  pp_printf (pp, "%d -> %d;", PT_PID (ptree), PT_PID (child));
+	  pp_printf (pp, "%d -> %d [label = %d];", PT_PID (ptree), PT_PID (child), i);
 	}
     }
 
@@ -2349,13 +2494,16 @@ 
 pretty_print_ptree_vec (pretty_printer *pp, vec<struct primop_tree *> list)
 {
   vec<struct primop_tree *> worklist;
+  struct primop_tree *tmp;
+  int i;
 
   worklist = list.copy ();
 
-  while (worklist.length () > 0)
+  for (i = 0; i < worklist.length (); i++)
     {
       pp_newline_and_indent (pp, 0);
-      pp_primop_tree (pp, worklist.pop ());
+      worklist.iterate (i, &tmp);
+      pp_primop_tree (pp, tmp);
     }
 }
 
@@ -2449,6 +2597,8 @@ 
   unsigned int num_vectorized_loops = 0;
   unsigned int ret = 0;
   unsigned int i;
+  unsigned int vector_sizes, max_vec_size;
+
   vect_loops_num = number_of_loops (cfun);
 
   /* Bail out if there are no loops.  */
@@ -2474,6 +2624,9 @@ 
 	/* Vectorization should be possible.  Let us find if all statements are
 	   vectorizable, and if yes, create p-tree.  */
         struct ITER_node * tmp_iter_node;
+	struct primop_tree *tmp_tree;
+	bool failed;
+	vec<struct primop_tree *> worklist;
 
 	vect_location = find_loop_location (loop);
 	if (LOCATION_LOCUS (vect_location) != UNKNOWN_LOCATION
@@ -2516,6 +2669,55 @@ 
 	      dump_iter_node (tmp_iter_node, alt_dump_file);
 	  }
 
+	/* To enable best possible instruction selection, the tree should be
+	   promoted to MAX_VEC_SIZE first, and then reduced to arity supported
+	   by architecture.  The macro TARGET_VECTORIZATION_ARITY provides list
+  	   of arities supported by architecture.  */
+
+        vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
+
+	max_vec_size = 1 << floor_log2 (vector_sizes);
+
+	failed =false;
+	i = 0;
+        worklist = vNULL;
+	worklist = (ITER_NODE_LOOP_BODY (tmp_iter_node)).copy ();
+        for (i = 0;  i < worklist.length (); i++)
+	  {
+	    gcc_assert (worklist.iterate (i, &tmp_tree));
+            tmp_tree = k_arity_promotion_reduction (tmp_tree, max_vec_size);
+	    if (tmp_tree == NULL)
+	      {
+		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+				 "%d_promotion failed.\n", max_vec_size);
+		failed = true;
+		break;
+	      }
+
+	    tmp_tree = k_arity_promotion_reduction (tmp_tree, 2);
+	    if (tmp_tree == NULL)
+	      {
+		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+				 "arity_promotion_reduction failed.\n");
+		failed = true;
+		break;
+	      }
+
+	    ITER_NODE_LOOP_BODY (tmp_iter_node)[i] = tmp_tree;
+	  }
+
+	if (failed)
+	  continue;
+
+	if (dump_enabled_p ())
+          {
+	    dump_printf (MSG_NOTE, "\nk-arity promotion/reduction applied.\n");
+	    if (dump_file)
+	      dump_iter_node (tmp_iter_node, dump_file);
+	    if (alt_dump_file)
+	      dump_iter_node (tmp_iter_node, alt_dump_file);
+	  }
+
 	gimple *loop_vectorized_call = vect_loop_vectorized_call (loop);
 	/* If the loop is vectorized, set uid of stmts within scalar loop to
 	   0.  This change is needed if transform phase uses this loop info.  */
Index: gcc/tree-vect-unified.h
===================================================================
--- gcc/tree-vect-unified.h	(revision 243687)
+++ gcc/tree-vect-unified.h	(working copy)
@@ -121,50 +121,6 @@ 
 #define STMT_ATTR_DR(s) (get_stmt_attr (s))->dr
 #define STMT_ATTR_VECTYPE(s) (get_stmt_attr (s))->vectype
 
-vec<struct stmt_attr *> stmt_attr_vec;
-
-void
-init_stmt_attr_vec (void)
-{ 
-  gcc_assert (!stmt_attr_vec.exists ());
-  stmt_attr_vec.create (50);
-}
-
-void
-free_stmt_attr_vec (void)
-{ 
-  gcc_assert (stmt_attr_vec.exists ());
-  stmt_attr_vec.release ();
-}
-
-inline void
-set_stmt_attr (gimple *stmt, struct stmt_attr *info)
-{ 
-  unsigned int uid = gimple_uid (stmt);
-  if (uid == 0)
-    { 
-      gcc_checking_assert (info);
-      uid = stmt_attr_vec.length () + 1;
-      gimple_set_uid (stmt, uid);
-      stmt_attr_vec.safe_push (info);
-    }
-  else
-    {
-      gcc_checking_assert (info == NULL);
-      stmt_attr_vec[uid - 1] = info;
-    }
-}
-
-inline struct stmt_attr *
-get_stmt_attr (gimple *stmt)
-{ 
-  unsigned int uid = gimple_uid (stmt);
-  if (uid == 0)
-    return NULL;
-
-  return stmt_attr_vec[uid - 1];
-}
-
 /* PRIMOP_TREE : Memory accesses within a loop have definite repetative pattern
    which can be captured using primitive permute operators which can be used to
    determine desired permute order for the vector computations.  The PRIMOP_TREE
@@ -277,14 +233,6 @@ 
 #define PT_AUX(x) (x)->aux
 
 //struct ITER_node *iter_node;
-
-extern unsigned int vectorize_loops_using_uniop (void);
-extern struct primop_tree * analyze_and_create_ptree (struct primop_tree *,
-		 gimple *, struct ITER_node *);
-extern void pretty_print_ptree_vec (pretty_printer *,
-				    vec<struct primop_tree*>);
-extern void pretty_print_iter_node (pretty_printer *, struct ITER_node *, int);
-
 enum primop_code {
   POP_ILV=MAX_TREE_CODES,
   POP_CONCAT,
@@ -295,3 +243,32 @@ 
   POP_CONST,
   POP_INV,
   POP_ITER};
+
+extern unsigned int vectorize_loops_using_uniop (void);
+extern struct primop_tree * analyze_and_create_ptree (struct primop_tree *,
+		 		gimple *, struct ITER_node *);
+extern void pretty_print_ptree_vec (pretty_printer *,
+				vec<struct primop_tree*>);
+extern void pretty_print_iter_node (pretty_printer *, struct ITER_node *, int);
+extern struct primop_tree * k_arity_promotion_reduction (struct primop_tree *,
+				int);
+extern struct primop_tree * init_primop_node (void);
+extern struct primop_tree * populate_prim_node (enum primop_code, tree,
+				struct primop_tree *, gimple *);
+extern struct primop_tree * exists_primTree_with_memref (tree, tree, bool,
+				struct ITER_node *);
+extern struct primop_tree * create_primTree_memref (tree, tree, bool, int, tree,
+				 struct primop_tree *);
+extern struct primop_tree * create_primTree_combine (enum primop_code, gimple *,
+				 int, tree, struct primop_tree *);
+extern struct primop_tree * create_primTree_partition (enum primop_code,
+				 gimple *, int, int, tree,
+				 struct primop_tree *);
+extern void add_child_at_index (struct primop_tree *, struct primop_tree *, int);
+extern struct primop_tree * get_child_at_index (struct primop_tree *, int);
+extern struct primop_tree * duplicate_prim_node (struct primop_tree *);
+extern void pp_primop_tree (pretty_printer *, struct primop_tree *);
+extern void dump_primtree_node (int, struct primop_tree *);
+
+
+