diff mbox

[Cilkplus] Elemental Function Mangling

Message ID BF230D13CA30DD48930C31D4099330000596FC@FMSMSX102.amr.corp.intel.com
State New
Headers show

Commit Message

Iyer, Balaji V March 12, 2012, 6:21 p.m. UTC
Hello Everyone,
      This patch is for the Cilkplus branch and mainly affects the C compiler. It makes the vector function name mangling for elemental functions work correctly.

Thanking You,

Yours Sincerely,

Balaji V. Iyer.
diff mbox

Patch

diff --git a/gcc/ChangeLog.cilk b/gcc/ChangeLog.cilk
index 1398870..c1b1d71 100644
--- a/gcc/ChangeLog.cilk
+++ b/gcc/ChangeLog.cilk
@@ -1,3 +1,20 @@ 
+2012-03-11  Balaji V. Iyer  <balaji.v.iyer@intel.com>
+
+	* attribs.c (decl_attributes): Concatenated existing attributes with
+	vector attributes.
+	* c-decl.c (bind): Added a check if scope is not null.
+	* elem-function.c (rename_elem_fn): New function.
+	(is_elem_fn): Likewise.
+	(find_processor_code): Likewise.
+	(find_vlength_code): Likewise.
+	(create_processor_attribute): Likewise.
+	(create_optimize_attribute): Likewise.
+	(find_suffix): Likewise.
+	(create_elem_fn_nodes): Likewise.
+	(extract_elem_fn_values): Likewise.
+	(create_elem_vec_fn): Likewise.
+	* passes.c (init_optimization_passes): Added elemental function pass.
+
 2012-03-09  Balaji V. Iyer  <balaji.v.iyer@intel.com>
 
 	* attribs.c (decl_attributes): Added a check for elemental function
diff --git a/gcc/Makefile.in b/gcc/Makefile.in
index 0da06b3..ada4090 100644
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@@ -1453,6 +1453,7 @@  OBJS = \
         cilk.o \
         cilk-low.o \
 	array-notation-common.o \
+	elem-function.o \
 	$(out_object_file) \
 	$(EXTRA_OBJS) \
 	$(host_hook_obj)
@@ -3436,7 +3437,8 @@  lower-subreg.o : lower-subreg.c $(CONFIG_H) $(SYSTEM_H) coretypes.h \
    insn-config.h $(BASIC_BLOCK_H) $(RECOG_H) $(OBSTACK_H) $(BITMAP_H) \
    $(EXPR_H) $(EXCEPT_H) $(REGS_H) $(TREE_PASS_H) $(DF_H) dce.h
 
-
+elem-function.o: elem-function.c $(CONFIG_H) $(SYSTEM_H) $(TREE_H) $(GIMPLE_H) \
+  $(OPTABS_H) $(RECOG_H)
 
 array-notation-common.o: array-notation-common.c $(CONFIG_H) $(SYSTEM_H) \
     $(TREE_H) $(RTL_H) $(OPTABS_H) $(GIMPLE_H) $(RECOG_H) 
diff --git a/gcc/attribs.c b/gcc/attribs.c
index 2ececc4..13c1417 100644
--- a/gcc/attribs.c
+++ b/gcc/attribs.c
@@ -327,6 +327,12 @@  decl_attributes (tree *node, tree attributes, int flags)
 	  if (!is_elem_fn_attribute_p (name))
 	    warning (OPT_Wattributes, "%qE attribute directive ignored",
 		     name);
+	  else
+	    {
+	      returned_attrs = tree_cons (name, args, returned_attrs);
+	      DECL_ATTRIBUTES (*anode) = tree_cons (name, args,
+						    DECL_ATTRIBUTES (*anode));
+	    }
 	  continue;
 	}
       else if (list_length (args) < spec->min_length
diff --git a/gcc/c-decl.c b/gcc/c-decl.c
index 4abf738..6251bee 100644
--- a/gcc/c-decl.c
+++ b/gcc/c-decl.c
@@ -620,7 +620,8 @@  bind (tree name, tree decl, struct c_scope *scope, bool invisible,
   b->shadowed = 0;
   b->decl = decl;
   b->id = name;
-  b->depth = scope->depth;
+  if (scope)
+    b->depth = scope->depth;
   b->invisible = invisible;
   b->nested = nested;
   b->inner_comp = 0;
@@ -629,8 +630,11 @@  bind (tree name, tree decl, struct c_scope *scope, bool invisible,
 
   b->u.type = NULL;
 
-  b->prev = scope->bindings;
-  scope->bindings = b;
+  if (scope)
+    {
+      b->prev = scope->bindings;
+      scope->bindings = b;
+    }
 
   if (decl_jump_unsafe (decl))
     scope->has_jump_unsafe_decl = 1;
@@ -658,9 +662,11 @@  bind (tree name, tree decl, struct c_scope *scope, bool invisible,
   /* Locate the appropriate place in the chain of shadowed decls
      to insert this binding.  Normally, scope == current_scope and
      this does nothing.  */
-  while (*here && (*here)->depth > scope->depth)
-    here = &(*here)->shadowed;
-
+  if (scope)
+    {
+      while (*here && (*here)->depth > scope->depth)
+	here = &(*here)->shadowed;
+    }
   b->shadowed = *here;
   *here = b;
 }
diff --git a/gcc/c-family/ChangeLog.cilk b/gcc/c-family/ChangeLog.cilk
index a4049d0..08d4c2d 100644
--- a/gcc/c-family/ChangeLog.cilk
+++ b/gcc/c-family/ChangeLog.cilk
@@ -1,3 +1,7 @@ 
+2012-03-11  Balaji V. Iyer  <balaji.v.iyer@intel.com>
+
+	* c-common.c (handle_vector_attribute): New function.
+
 2012-01-20  Balaji V. Iyer  <balaji.v.iyer@intel.com>
 
 	* c-common.c (c_define_builtins): Added a call to
diff --git a/gcc/c-family/c-common.c b/gcc/c-family/c-common.c
index f84ccb9..47b5c54 100644
--- a/gcc/c-family/c-common.c
+++ b/gcc/c-family/c-common.c
@@ -381,6 +381,7 @@  static tree handle_type_generic_attribute (tree *, tree, tree, int, bool *);
 static tree handle_alloc_size_attribute (tree *, tree, tree, int, bool *);
 static tree handle_target_attribute (tree *, tree, tree, int, bool *);
 static tree handle_optimize_attribute (tree *, tree, tree, int, bool *);
+static tree handle_vector_attribute (tree *, tree, tree, int, bool *);
 static tree ignore_attribute (tree *, tree, tree, int, bool *);
 static tree handle_no_split_stack_attribute (tree *, tree, tree, int, bool *);
 static tree handle_fnspec_attribute (tree *, tree, tree, int, bool *);
@@ -741,6 +742,8 @@  const struct attribute_spec c_common_attribute_table[] =
 			      handle_target_attribute, false },
   { "optimize",               1, -1, true, false, false,
 			      handle_optimize_attribute, false },
+  { "vector",                 1, -1, true, false, false,
+                              handle_vector_attribute, false },
   /* For internal use only.  The leading '*' both prevents its usage in
      source code and signals that it may be overridden by machine tables.  */
   { "*tm regparm",            0, 0, false, true, true,
@@ -8268,6 +8271,22 @@  parse_optimize_options (tree args, bool attr_p)
   return ret;
 }
 
+/* Handle "vector": forward to the optimize handler with the argument "O3".  */
+static tree
+handle_vector_attribute (tree *node, tree name ATTRIBUTE_UNUSED,
+			 tree args ATTRIBUTE_UNUSED,
+			 int ARG_UNUSED (flags), bool *no_add_attrs)
+{
+  VEC(tree,gc) *o3_vec = make_tree_vector ();
+  tree o3_args;
+  VEC_safe_push (tree, gc, o3_vec, build_string (2, "O3"));
+  o3_args = build_tree_list_vec (o3_vec);
+  release_tree_vector (o3_vec);
+  handle_optimize_attribute (node, get_identifier ("optimize"), o3_args,
+			     flags, no_add_attrs);
+  return NULL_TREE;
+}
+
 /* For handling "optimize" attribute. arguments as in
    struct attribute_spec.handler.  */
 
diff --git a/gcc/elem-function.c b/gcc/elem-function.c
new file mode 100755
index 0000000..a5a7b61
--- /dev/null
+++ b/gcc/elem-function.c
@@ -0,0 +1,594 @@ 
+/* This file is part of the Intel(R) Cilk(TM) Plus support
+   This file contains the functions for Elemental functions.
+   
+   Copyright (C) 2012  Free Software Foundation, Inc.
+   Written by Balaji V. Iyer <balaji.v.iyer@intel.com>,
+   Intel Corporation
+
+   Many Thanks to Karthik Kumar for advice on the basic technique
+   about cloning functions.
+   
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "tm.h"
+#include "tree.h"
+#include "rtl.h"
+#include "tm_p.h"
+#include "hard-reg-set.h"
+#include "basic-block.h"
+#include "output.h"
+#include "c-family/c-common.h"
+#include "diagnostic.h"
+#include "tree-flow.h"
+#include "tree-dump.h"
+#include "tree-pass.h"
+#include "timevar.h"
+#include "cfgloop.h"
+#include "flags.h"
+#include "tree-inline.h"
+#include "cgraph.h"
+#include "ipa-prop.h"
+#include "opts.h"
+#include "tree-iterator.h"
+#include "toplev.h"
+#include "options.h"
+#include "intl.h"
+#include "vec.h"
+
+#define MAX_VARS 50 /* Size of every fixed array in elem_fn_info.  */
+/* Mask request parsed from a "vector" attribute (elem_fn_info.mask).  */
+enum mask_options {
+  USE_MASK = 12345, /* A "mask" clause was seen.  */
+  USE_NOMASK, /* A "nomask" clause was seen.  */
+  USE_BOTH /* Initial value, kept when neither clause is seen.  */
+};
+/* Everything extract_elem_fn_values collects for one function decl.  */
+typedef struct
+{
+  char *proc_type; /* Copy of the string given in the "processor" clause.  */
+  enum mask_options mask; /* See enum mask_options.  */
+  int vectorlength[MAX_VARS]; /* Integer constants from "vectorlength".  */
+  int no_vlengths; /* Number of entries used in vectorlength.  */
+  char *uniform_vars[MAX_VARS]; /* Copies of the names in "uniform".  */
+  int no_uvars; /* Number of entries used in uniform_vars.  */
+  int uniform_location[MAX_VARS]; /* their 1-based location in parm list */
+  char *linear_vars[MAX_VARS]; /* Copies of the names in "linear".  */
+  int linear_steps[MAX_VARS]; /* Step read right after each linear name.  */
+  int linear_location[MAX_VARS]; /* their 1-based location in parm list */
+  int no_lvars; /* Number of entries used in linear_vars/linear_steps.  */
+  int private_location[MAX_VARS]; /* parm not in uniform or linear list */
+  int no_pvars; /* Number of entries used in private_location.  */
+  char *func_prefix; /* NOTE(review): not set or read in the code shown.  */
+  int total_no_args; /* Number of parameters counted on the decl.  */
+} elem_fn_info;
+
+static elem_fn_info *extract_elem_fn_values (tree);
+static tree create_optimize_attribute (int);
+static tree create_processor_attribute (elem_fn_info *, tree *);
+
+/* Append SUFFIX to the name of DECL, changing DECL in place, and return DECL.
+   DECL is returned unchanged when SUFFIX is NULL or DECL has no name.  */
+static tree
+rename_elem_fn (tree decl, const char *suffix)
+{
+  const char *fn_name;
+  char *new_fn_name;
+
+  /* Test DECL_NAME itself: IDENTIFIER_POINTER must not be applied to a
+     missing name.  */
+  if (!suffix || !DECL_NAME (decl))
+    return decl;
+
+  fn_name = IDENTIFIER_POINTER (DECL_NAME (decl));
+  new_fn_name = concat (fn_name, suffix, NULL);
+
+  /* get_identifier stores its own copy of the text, so the temporary
+     buffer can be released instead of leaked.  */
+  DECL_NAME (decl) = get_identifier (new_fn_name);
+  free (new_fn_name);
+  return decl;
+}
+
+/* Return true when NODE's function declaration carries an attribute whose
+   name is exactly "vector"; return false otherwise, including for NULL.  */
+static bool
+is_elem_fn (struct cgraph_node *node)
+{
+  tree attr;
+
+  if (node == NULL)
+    return false;
+
+  /* Walk the attribute chain; TREE_PURPOSE of each link is tested as name.  */
+  attr = DECL_ATTRIBUTES (node->decl);
+  while (attr)
+    {
+      tree attr_name = TREE_PURPOSE (attr);
+      if (TREE_CODE (attr_name) == IDENTIFIER_NODE
+	  && strcmp (IDENTIFIER_POINTER (attr_name), "vector") == 0)
+	return true;
+      attr = TREE_CHAIN (attr);
+    }
+  return false;
+}
+
+/* Map the processor name in ELEM_FN_VALUES to the one-letter code used when
+   mangling the vector function.  Returns an xstrdup'd string, or NULL when
+   ELEM_FN_VALUES or its proc_type is NULL.  */
+static char *
+find_processor_code (elem_fn_info *elem_fn_values)
+{
+  static const char *const proc_codes[][2] = {
+    { "pentium_4", "B" },
+    { "pentium4_sse3", "D" },
+    { "core2_duo_ssse3", "E" },
+    { "core2_duo_sse_4_1", "F" },
+    { "core_i7_sse4_2", "H" }
+  };
+  size_t ii;
+
+  if (!elem_fn_values || !elem_fn_values->proc_type)
+    return NULL;
+
+  for (ii = 0; ii < ARRAY_SIZE (proc_codes); ii++)
+    if (!strcmp (elem_fn_values->proc_type, proc_codes[ii][0]))
+      return xstrdup (proc_codes[ii][1]);
+  gcc_unreachable (); /* Any other processor name ends up here.  */
+  return NULL; /* should never get here */
+}
+
+/* Return, as an xmalloc'd decimal string, the first recorded "vectorlength"
+   of ELEM_FN_VALUES or else the default (4) for its processor.  Returns NULL
+   when ELEM_FN_VALUES is NULL.  */
+static char *
+find_vlength_code (elem_fn_info *elem_fn_values)
+{
+  enum { VLENGTH_BUF_SIZE = 12 }; /* "%d" of a 32-bit int plus the NUL.  */
+  const char *proc;
+  char *vlength_code;
+
+  /* Validate before allocating so this return cannot leak the buffer.  */
+  if (!elem_fn_values)
+    return NULL;
+
+  vlength_code = XCNEWVEC (char, VLENGTH_BUF_SIZE); /* Zero-filled.  */
+  proc = elem_fn_values->proc_type;
+  if (elem_fn_values->no_vlengths != 0)
+    snprintf (vlength_code, VLENGTH_BUF_SIZE, "%d",
+	      elem_fn_values->vectorlength[0]);
+  else if (proc
+	   && (!strcmp (proc, "pentium_4")
+	       || !strcmp (proc, "pentium4_sse3")
+	       || !strcmp (proc, "core2_duo_ssse3")
+	       || !strcmp (proc, "core2_duo_sse_4_1")
+	       || !strcmp (proc, "core_i7_sse4_2")))
+    strcpy (vlength_code, "4");
+  else
+    gcc_unreachable (); /* Unknown or missing processor name.  */
+  return vlength_code;
+}
+
+/* Build a "__target__" attribute for ELEM_FN_VALUES->proc_type; when
+   OPPOSITE_ATTR is non-NULL, also store the matching "no-<isa>" attribute.  */
+static tree
+create_processor_attribute (elem_fn_info *elem_fn_values, tree *opposite_attr)
+{
+  /* The opposite attribute is the one needed for the scalar code part.  */
+  tree proc_attr, opp_proc_attr;
+  VEC(tree,gc) *proc_vec_list = VEC_alloc (tree, gc, 4);
+  VEC(tree,gc) *opp_proc_vec_list = VEC_alloc (tree, gc, 4);
+  
+  if (!elem_fn_values || !elem_fn_values->proc_type)
+    return NULL_TREE;
+
+  if (!strcmp (elem_fn_values->proc_type, "pentium_4"))
+    {
+      VEC_safe_push (tree, gc, proc_vec_list,
+		     build_string (strlen ("arch=pentium4"), "arch=pentium4"));
+      VEC_safe_push (tree, gc, proc_vec_list,
+		     build_string (strlen ("mmx"), "mmx"));
+      if (opposite_attr)
+	{
+	  VEC_safe_push (tree, gc, opp_proc_vec_list,
+			 build_string (strlen ("no-mmx"), "no-mmx"));
+	  VEC_safe_push (tree, gc, opp_proc_vec_list,
+			 build_string (strlen ("arch=pentium4"),
+				       "arch=pentium4"));
+	}
+    }
+  else if (!strcmp (elem_fn_values->proc_type, "pentium4_sse3"))
+    {
+      VEC_safe_push (tree, gc, proc_vec_list,
+		     build_string (strlen ("arch=pentium4"), "arch=pentium4"));
+      VEC_safe_push (tree, gc, proc_vec_list,
+		     build_string (strlen ("sse3"), "sse3"));
+      if (opposite_attr)
+	{
+	  VEC_safe_push (tree, gc, opp_proc_vec_list,
+			 build_string (strlen ("arch=pentium4"),
+				       "arch=pentium4"));
+	  VEC_safe_push (tree, gc, opp_proc_vec_list,
+			 build_string (strlen ("no-sse3"), "no-sse3"));
+	}
+    }
+  else if (!strcmp (elem_fn_values->proc_type, "core2_duo_ssse3"))
+    {
+      VEC_safe_push (tree, gc, proc_vec_list,
+		     build_string (strlen ("arch=core2"), "arch=core2"));
+      VEC_safe_push (tree, gc, proc_vec_list,
+		     build_string (strlen ("ssse3"), "ssse3"));
+      if (opposite_attr)
+	{
+	  VEC_safe_push (tree, gc, opp_proc_vec_list,
+			 build_string (strlen ("arch=core2"), "arch=core2"));
+	  VEC_safe_push (tree, gc, opp_proc_vec_list,
+			 build_string (strlen ("no-ssse3"), "no-ssse3"));
+	}
+    }
+  else if (!strcmp (elem_fn_values->proc_type, "core2_duo_sse_4_1"))
+    {
+      VEC_safe_push (tree, gc, proc_vec_list,
+		     build_string (strlen ("arch=core2"), "arch=core2"));
+      VEC_safe_push (tree, gc, proc_vec_list,
+		     build_string (strlen ("sse4.1"), "sse4.1"));
+      if (opposite_attr)
+	{
+	  VEC_safe_push (tree, gc, opp_proc_vec_list,
+			 build_string (strlen ("arch=core2"), "arch=core2"));
+	  VEC_safe_push (tree, gc, opp_proc_vec_list,
+			 build_string (strlen ("no-sse4.1"), "no-sse4.1"));
+	}
+    }
+  else if (!strcmp (elem_fn_values->proc_type, "core_i7_sse4_2"))
+    {
+      VEC_safe_push (tree, gc, proc_vec_list,
+		     build_string (strlen ("arch=corei7"), "arch=corei7"));
+      VEC_safe_push (tree, gc, proc_vec_list,
+		     build_string (strlen ("sse4.2"), "sse4.2"));
+      if (opposite_attr)
+	{
+	  VEC_safe_push (tree, gc, opp_proc_vec_list,
+			 build_string (strlen ("arch=corei7"), "arch=corei7"));
+	  VEC_safe_push (tree, gc, opp_proc_vec_list,
+			 build_string (strlen ("no-sse4.2"), "no-sse4.2"));
+	}
+    }
+  else
+    sorry ("Processor type not supported.");
+
+  proc_attr = build_tree_list_vec (proc_vec_list);
+  VEC_truncate (tree, proc_vec_list, 0);
+  proc_attr = build_tree_list (get_identifier ("__target__"), proc_attr);
+
+  if (opposite_attr)
+    {
+      opp_proc_attr = build_tree_list_vec (opp_proc_vec_list);
+      VEC_truncate (tree, opp_proc_vec_list, 0);
+      opp_proc_attr = build_tree_list (get_identifier ("__target__"),
+				       opp_proc_attr);
+      *opposite_attr = opp_proc_attr;
+    }
+  return proc_attr;
+}
+
+/* Build an "optimize" attribute whose single argument is "O<OPTION>"; the
+   caller uses it to turn the vectorizer on for the vector function.  */
+static tree
+create_optimize_attribute (int option)
+{
+  tree opt_attr;
+  VEC(tree,gc) *opt_vec = VEC_alloc (tree,gc, 4);
+  char optimization[16]; /* Room for 'O', any int in decimal, and the NUL.  */
+
+  snprintf (optimization, sizeof optimization, "O%d", option);
+  VEC_safe_push (tree, gc, opt_vec,
+		 build_string (strlen (optimization), optimization));
+  opt_attr = build_tree_list_vec (opt_vec);
+  VEC_truncate (tree, opt_vec, 0);
+  return build_tree_list (get_identifier ("optimize"), opt_attr);
+}
+  
+/* Build the mangling suffix for the vector clone of ELEM_FN_VALUES:
+   "._simdsimd_", the processor code, the vector length, 'm' if MASKED else
+   'n', then for each parameter in order "_l<step>" (linear), "_s1" (uniform)
+   or "_v1" (neither).  Returns an xmalloc'd string, or NULL when the
+   processor or vector-length code is unavailable.  */
+static char *
+find_suffix (elem_fn_info *elem_fn_values, bool masked)
+{
+  char *proc_code = find_processor_code (elem_fn_values);
+  char *vlength_code = find_vlength_code (elem_fn_values);
+  char *suffix = NULL;
+  char step[16];
+  int arg_number, ii;
+
+  /* The mask letter is appended to the prefix, never copied over it.  */
+  if (proc_code && vlength_code)
+    suffix = concat ("._simdsimd_", proc_code, vlength_code,
+		     masked ? "m" : "n", NULL);
+  free (proc_code);
+  free (vlength_code);
+
+  for (arg_number = 1; suffix && arg_number <= elem_fn_values->total_no_args;
+       arg_number++)
+    {
+      for (ii = 0; ii < elem_fn_values->no_lvars; ii++)
+	if (elem_fn_values->linear_location[ii] == arg_number)
+	  {
+	    snprintf (step, sizeof step, "%d",
+		      elem_fn_values->linear_steps[ii]);
+	    /* reconcat frees the old string after building the new one.  */
+	    suffix = reconcat (suffix, suffix, "_l", step, NULL);
+	  }
+      for (ii = 0; ii < elem_fn_values->no_uvars; ii++)
+	if (elem_fn_values->uniform_location[ii] == arg_number)
+	  suffix = reconcat (suffix, suffix, "_s1", NULL);
+      for (ii = 0; ii < elem_fn_values->no_pvars; ii++)
+	if (elem_fn_values->private_location[ii] == arg_number)
+	  suffix = reconcat (suffix, suffix, "_v1", NULL);
+    }
+  return suffix;
+}
+
+/* Build NODE's vector clone; returns NULL if no values are extracted.  */
+static struct cgraph_node *
+create_elem_fn_nodes (struct cgraph_node *node)
+{
+  tree new_decl, old_decl, new_decl_name, opt_attr;
+  tree proc_attr, opp_proc_attr = NULL_TREE;
+  struct cgraph_node *new_node;
+  elem_fn_info *elem_fn_values = NULL;
+  char *suffix = NULL;
+  
+  old_decl = node->decl;
+  new_decl = copy_node (old_decl);
+  elem_fn_values = extract_elem_fn_values (old_decl);
+
+  if (elem_fn_values)
+    {
+      suffix = find_suffix (elem_fn_values, false);
+    }
+  else
+    return NULL;
+  /* rename_elem_fn renames NEW_DECL in place and hands it back.  */
+  new_decl_name = rename_elem_fn (new_decl, suffix);
+
+  SET_DECL_ASSEMBLER_NAME (new_decl, DECL_NAME(new_decl_name));
+  SET_DECL_RTL (new_decl, NULL);
+  TREE_SYMBOL_REFERENCED (DECL_NAME (new_decl_name)) = 1;
+  
+  new_node = cgraph_copy_node_for_versioning (node, new_decl, NULL, NULL);
+  new_node->local.externally_visible = node->local.externally_visible;
+  new_node->lowered = true;
+
+  tree_function_versioning (old_decl, new_decl, NULL, false, NULL, false, NULL,
+			    NULL);
+  cgraph_call_function_insertion_hooks (new_node);
+  DECL_STRUCT_FUNCTION (new_decl)->elem_fn_already_cloned = true;
+  DECL_STRUCT_FUNCTION (new_decl)->curr_properties = cfun->curr_properties;
+  DECL_ATTRIBUTES (cfun->decl) =
+    remove_attribute ("vector", DECL_ATTRIBUTES (cfun->decl));
+  DECL_ATTRIBUTES (new_node->decl) =
+    remove_attribute ("vector", DECL_ATTRIBUTES (new_node->decl));
+  /* NOTE(review): assumes cfun->decl is NODE's decl here -- confirm.  */
+  proc_attr = create_processor_attribute (elem_fn_values, &opp_proc_attr);
+  
+  if (proc_attr)
+    decl_attributes (&new_node->decl, proc_attr, 0);
+  if (opp_proc_attr)
+    decl_attributes (&cfun->decl, opp_proc_attr, 0);
+
+  opt_attr = create_optimize_attribute (3); /* this will turn vectorizer on */
+  if (opt_attr)
+    decl_attributes (&new_node->decl, opt_attr, 0);
+  
+  return new_node;
+}
+
+/* Parse DECL's "vector" attribute into a new elem_fn_info and record the
+   1-based position of every parameter as uniform, linear or private.
+   Returns NULL when DECL carries no attributes.  */
+static elem_fn_info *
+extract_elem_fn_values (tree decl)
+{
+  elem_fn_info *elem_fn_values = NULL;
+  int x = 0; /* this is a dummy variable */
+  int arg_number = 0, ii = 0;
+  tree ii_tree, jj_tree, kk_tree;
+  tree decl_attr = DECL_ATTRIBUTES (decl);
+  
+  if (!decl_attr)
+    return NULL;
+
+  /* XCNEW zero-fills, so proc_type starts NULL and no_pvars starts at 0.  */
+  elem_fn_values = XCNEW (elem_fn_info);
+  elem_fn_values->mask = USE_BOTH;
+  elem_fn_values->no_vlengths = 0;
+  elem_fn_values->no_uvars = 0;
+  elem_fn_values->no_lvars = 0;
+
+  for (ii_tree = decl_attr; ii_tree; ii_tree = TREE_CHAIN (ii_tree))
+    {
+      tree ii_purpose = TREE_PURPOSE (ii_tree);
+      tree ii_value = TREE_VALUE (ii_tree);
+      if (TREE_CODE (ii_purpose) == IDENTIFIER_NODE
+	  && !strcmp (IDENTIFIER_POINTER (ii_purpose), "vector"))
+	{
+	  for (jj_tree = ii_value; jj_tree;
+	       jj_tree = TREE_CHAIN (jj_tree))
+	    {
+	      tree jj_value = TREE_VALUE (jj_tree);
+	      tree jj_purpose = TREE_PURPOSE (jj_value);
+	      if (TREE_CODE (jj_purpose) == IDENTIFIER_NODE
+		  && !strcmp (IDENTIFIER_POINTER (jj_purpose), "processor"))
+		{
+		  for (kk_tree = TREE_VALUE (jj_value); kk_tree;
+		       kk_tree = TREE_CHAIN (kk_tree))
+		    {
+		      tree kk_value = TREE_VALUE (kk_tree);
+		      if (TREE_CODE (kk_value) == STRING_CST)
+			elem_fn_values->proc_type =
+			  xstrdup (TREE_STRING_POINTER (kk_value));
+		    }
+		}
+	      else if (TREE_CODE (jj_purpose) == IDENTIFIER_NODE
+		       && !strcmp (IDENTIFIER_POINTER (jj_purpose),
+				  "vectorlength"))
+		{
+		  for (kk_tree = TREE_VALUE (jj_value); kk_tree;
+		       kk_tree = TREE_CHAIN (kk_tree))
+		    {
+		      tree kk_value = TREE_VALUE (kk_tree);
+		      if (TREE_CODE (kk_value) == INTEGER_CST)
+			{
+			  x = elem_fn_values->no_vlengths;
+			  elem_fn_values->vectorlength[x] =
+			    (int) TREE_INT_CST_LOW (kk_value);
+			  elem_fn_values->no_vlengths++;
+			}
+		    }
+		}
+	      else if (TREE_CODE (jj_purpose) == IDENTIFIER_NODE
+		       && !strcmp (IDENTIFIER_POINTER (jj_purpose), "uniform"))
+		{
+		  for (kk_tree = TREE_VALUE (jj_value); kk_tree;
+		       kk_tree = TREE_CHAIN (kk_tree))
+		    {
+		      tree kk_value = TREE_VALUE (kk_tree);
+		      elem_fn_values->uniform_vars[elem_fn_values->no_uvars] =
+			xstrdup (TREE_STRING_POINTER (kk_value));
+		      elem_fn_values->no_uvars++;
+		    }
+		}
+	      else if (TREE_CODE (jj_purpose) == IDENTIFIER_NODE
+		       && !strcmp (IDENTIFIER_POINTER (jj_purpose), "linear"))
+		{
+		  for (kk_tree = TREE_VALUE (jj_value); kk_tree;
+		       kk_tree = TREE_CHAIN (kk_tree))
+		    {
+		      tree kk_value = TREE_VALUE (kk_tree);
+		      if (!TREE_CHAIN (kk_tree))
+			break; /* Name without a step: stop, don't read NULL.  */
+		      elem_fn_values->linear_vars[elem_fn_values->no_lvars] =
+			xstrdup (TREE_STRING_POINTER (kk_value));
+		      kk_tree = TREE_CHAIN (kk_tree);
+		      kk_value = TREE_VALUE (kk_tree);
+		      elem_fn_values->linear_steps[elem_fn_values->no_lvars] =
+			TREE_INT_CST_LOW (kk_value);
+		      elem_fn_values->no_lvars++;
+		    }
+		}
+	      else if (TREE_CODE (jj_purpose) == IDENTIFIER_NODE
+		       && !strcmp (IDENTIFIER_POINTER (jj_purpose), "mask"))
+		elem_fn_values->mask = USE_MASK;
+	      else if (TREE_CODE (jj_purpose) == IDENTIFIER_NODE
+		       && !strcmp (IDENTIFIER_POINTER (jj_purpose), "nomask"))
+		elem_fn_values->mask = USE_NOMASK;
+	    }
+	}
+    }
+
+  for (ii_tree = DECL_ARGUMENTS (decl); ii_tree; ii_tree = DECL_CHAIN (ii_tree))
+    {
+      bool already_found = false;
+      arg_number++;
+      for (ii = 0; ii < elem_fn_values->no_uvars; ii++)
+	{
+	  if (DECL_NAME (ii_tree)
+	      && !strcmp (IDENTIFIER_POINTER (DECL_NAME (ii_tree)),
+			  elem_fn_values->uniform_vars[ii]))
+	    {
+	      already_found = true;
+	      elem_fn_values->uniform_location[ii] = arg_number;
+	    }
+	}
+      for (ii = 0; ii < elem_fn_values->no_lvars; ii++)
+	{
+	  if (DECL_NAME (ii_tree)
+	      && !strcmp (IDENTIFIER_POINTER (DECL_NAME (ii_tree)),
+			  elem_fn_values->linear_vars[ii]))
+	    {
+	      if (already_found)
+		  fatal_error
+		    ("variable %s defined in both uniform and linear clause",
+		     elem_fn_values->linear_vars[ii]);
+	      else
+		{
+		  already_found = true;
+		  elem_fn_values->linear_location[ii] = arg_number;
+		}
+	    }
+	}
+      if (!already_found) /* this means this variable is a private */
+	elem_fn_values->private_location[elem_fn_values->no_pvars++] =
+	  arg_number;
+    }
+
+  elem_fn_values->total_no_args = arg_number;
+  
+  return elem_fn_values;
+}  
+
+/* Pass entry point: create the vector clone for every call-graph node that
+   is_elem_fn accepts and that is not already marked as cloned.  Returns 0.  */
+static unsigned int
+create_elem_vec_fn (void)
+{
+  struct cgraph_node *ii_node, *copied_node;
+
+  for (ii_node = cgraph_nodes; ii_node != NULL; ii_node = ii_node->next)
+    {
+      /* Nodes without a struct function have no clone flag to consult.  */
+      if (!is_elem_fn (ii_node)
+	  || !DECL_STRUCT_FUNCTION (ii_node->decl)
+	  || DECL_STRUCT_FUNCTION (ii_node->decl)->elem_fn_already_cloned)
+	continue;
+      /* create_elem_fn_nodes returns NULL when no values were extracted.  */
+      copied_node = create_elem_fn_nodes (ii_node);
+      if (!copied_node || !DECL_RTL (ii_node->decl))
+	continue;
+      SET_DECL_RTL (copied_node->decl, copy_rtx (DECL_RTL (ii_node->decl)));
+      XEXP (DECL_RTL (copied_node->decl), 0) =
+	gen_rtx_SYMBOL_REF
+	(GET_MODE (XEXP (DECL_RTL (ii_node->decl), 0)),
+	 IDENTIFIER_POINTER (DECL_NAME (copied_node->decl)));
+    }
+  return 0;
+}
+ 
+/* GIMPLE pass "tree_elem_fn": no gate, runs create_elem_vec_fn.  */
+struct gimple_opt_pass pass_elem_fn =
+  {
+    {
+      GIMPLE_PASS,
+      "tree_elem_fn",			/* name */
+      0,				/* gate */
+      create_elem_vec_fn,		/* execute */
+      NULL,				/* sub */
+      NULL,				/* next */
+      0,				/* static_pass_number */
+      TV_NONE,				/* tv_id */
+      PROP_gimple_any| PROP_cfg, 	/* properties_required */
+      0,				/* properties_provided */
+      0,				/* properties_destroyed */
+      0,				/* todo_flags_start */
+      TODO_dump_func|TODO_verify_flow,	/* todo_flags_finish */
+    }
+  };
diff --git a/gcc/function.c b/gcc/function.c
index 4508ae2..ed07a83 100644
--- a/gcc/function.c
+++ b/gcc/function.c
@@ -4418,6 +4418,8 @@  allocate_struct_function (tree fndecl, bool abstract_p)
       /* ??? This could be set on a per-function basis by the front-end
          but is this worth the hassle?  */
       cfun->can_throw_non_call_exceptions = flag_non_call_exceptions;
+
+      cfun->elem_fn_already_cloned = false;
     }
 }
 
diff --git a/gcc/function.h b/gcc/function.h
index b5b032a..d86a958 100644
--- a/gcc/function.h
+++ b/gcc/function.h
@@ -540,6 +540,8 @@  struct GTY(()) function {
 
   /* In a Cilk function, the VAR_DECL for the frame descriptor. */
   tree cilk_frame_decl;
+
+  bool elem_fn_already_cloned;
   
   /* For md files.  */
 
diff --git a/gcc/passes.c b/gcc/passes.c
index 520267a..cb9b552 100644
--- a/gcc/passes.c
+++ b/gcc/passes.c
@@ -1208,6 +1208,7 @@  init_optimization_passes (void)
       NEXT_PASS (pass_lower_vector);
       NEXT_PASS (pass_early_warn_uninitialized);
       NEXT_PASS (pass_rebuild_cgraph_edges);
+      NEXT_PASS (pass_elem_fn);
       NEXT_PASS (pass_inline_parameters);
       NEXT_PASS (pass_early_inline);
       NEXT_PASS (pass_all_early_optimizations);
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
index e871aeb..52e72ee 100644
--- a/gcc/tree-pass.h
+++ b/gcc/tree-pass.h
@@ -484,6 +484,7 @@  extern struct gimple_opt_pass pass_all_optimizations;
 extern struct gimple_opt_pass pass_cleanup_cfg_post_optimizing;
 extern struct gimple_opt_pass pass_init_datastructures;
 extern struct gimple_opt_pass pass_fixup_cfg;
+extern struct gimple_opt_pass pass_elem_fn;
 
 extern struct rtl_opt_pass pass_expand;
 extern struct rtl_opt_pass pass_init_function;