diff mbox

[hsa] Create HSA clones

Message ID 55E97482.9090908@suse.cz
State New
Headers show

Commit Message

Martin Liška Sept. 4, 2015, 10:37 a.m. UTC
Hello.

The following patch adds a new IPA pass that creates function clones
intended to be expanded to HSAIL. The pass also works in LTO mode.

Thanks,
Martin
diff mbox

Patch

From 8cbddf693f93328f117dc48588deee924d2df6cd Mon Sep 17 00:00:00 2001
From: mliska <mliska@suse.cz>
Date: Tue, 1 Sep 2015 14:10:24 +0200
Subject: [PATCH 1/4] HSA: create HSA clones.

gcc/c-family/ChangeLog:

2015-09-03  Martin Liska  <mliska@suse.cz>

	* c-common.c (c_common_attribute_table): Remove hsa and hsakernel
	attributes.
	(handle_hsa_attribute): Do not handle hsakernel attribute.

gcc/lto/ChangeLog:

2015-09-03  Martin Liska  <mliska@suse.cz>

	* lto-partition.c (add_symbol_to_partition_1): For an HSA clone, also
	add all its dependencies to the LTO partition.

libgomp/ChangeLog:

2015-09-03  Martin Liska  <mliska@suse.cz>

	* plugin/plugin-hsa.c (GOMP_OFFLOAD_load_image): Enable having a module
	without kernels (can contain HSA functions).

gcc/ChangeLog:

2015-09-03  Martin Liska  <mliska@suse.cz>

	* Makefile.in: Add new source file and remove hsa-gen.c from list
	of GT files.
	* cgraph.h: Remove hsa_imp_of property of cgraph_node.
	* hsa-brig.c (brig_init): Append LTRANS name to a BRIG module name.
	(emit_function_directives): Add new argument.
	(emit_function_declaration): Use it.
	(emit_call_insn): Record operand offsets of called functions so
	that they can be resolved once the whole BRIG module is emitted.
	(hsa_brig_emit_function): Emit declarations before a function
	is defined/declared.
	(hsa_output_kernel_mapping): An HSA brig module can have zero kernels.
	(hsa_output_brig): Process the recorded function call linkage and fill
	in the correct code references.
	* hsa-dump.c: Add new include files due to function_summary.
	* hsa-gen.c (hsa_get_gpu_function): New function.
	(hsa_get_host_function): New function.
	(gen_hsa_insns_for_direct_call): Small refactoring.
	(gen_hsa_insns_for_known_library_call): Likewise.
	(hsa_generate_function_declaration): Sanitize function name.
	(generate_hsa): Remove unused return value.
	(init_hsa_functions): Remove.
	(insert_store_range_dim): Likewise.
	(wrap_hsa_kernel_call): Likewise.
	(wrap_all_hsa_calls): Likewise.
	(pass_gen_hsail::execute): Emit code only for cgraph_nodes whose HSA
	summary marks them as a GPU implementation.
	* hsa-regalloc.c: Include additional header files.
	* hsa.c (hsa_get_declaration_name): Use asm_name as name of function.
	(hsa_register_kernel): New function.
	* hsa.h (enum hsa_function_kind): New enum.
	(struct hsa_function_summary): New.
	(hsa_summary_t::link_functions): Likewise.
	* ipa-hsa.c: New file.
	* lto-section-in.c: Add new section name.
	* lto-streamer.h (enum lto_section_type): Likewise.
	* omp-low.c (expand_parallel_call): Fill up HSA function summary.
	(expand_target_kernel_body): Likewise.
	* passes.c (execute_one_pass): Terminate the pass queue if
	TODO_stop_pass_execution is returned.
	(execute_pass_list_1): Likewise.
	(execute_ipa_pass_list): Likewise.
	* passes.def: Add new IPA pass.
	* timevar.def: Likewise.
	* tree-pass.h: Likewise.
---
 gcc/Makefile.in             |   2 +-
 gcc/c-family/c-common.c     |   9 --
 gcc/cgraph.h                |   4 -
 gcc/hsa-brig.c              |  89 +++++++++---
 gcc/hsa-dump.c              |  40 +++++-
 gcc/hsa-gen.c               | 267 +++++++----------------------------
 gcc/hsa-regalloc.c          |  27 +++-
 gcc/hsa.c                   |  28 ++++
 gcc/hsa.h                   |  62 +++++++++
 gcc/ipa-hsa.c               | 330 ++++++++++++++++++++++++++++++++++++++++++++
 gcc/lto-section-in.c        |   3 +-
 gcc/lto-streamer.h          |   1 +
 gcc/lto/lto-partition.c     |  48 +++++++
 gcc/omp-low.c               |  16 ++-
 gcc/passes.c                |  18 ++-
 gcc/passes.def              |   1 +
 gcc/timevar.def             |   1 +
 gcc/tree-pass.h             |   2 +
 libgomp/plugin/plugin-hsa.c |   2 -
 19 files changed, 681 insertions(+), 269 deletions(-)
 create mode 100644 gcc/ipa-hsa.c

diff --git a/gcc/Makefile.in b/gcc/Makefile.in
index 1a37630..ea8750b 100644
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@@ -1314,6 +1314,7 @@  OBJS = \
 	ipa-icf.o \
 	ipa-icf-gimple.o \
 	ipa-reference.o \
+	ipa-hsa.o \
 	ipa-ref.o \
 	ipa-utils.o \
 	ipa.o \
@@ -2371,7 +2372,6 @@  GTFILES = $(CPP_ID_DATA_H) $(srcdir)/input.h $(srcdir)/coretypes.h \
   $(srcdir)/ipa-devirt.c \
   $(srcdir)/internal-fn.h \
   $(srcdir)/hsa.c \
-  $(srcdir)/hsa-gen.c \
   @all_gtfiles@
 
 # Compute the list of GT header files from the corresponding C sources,
diff --git a/gcc/c-family/c-common.c b/gcc/c-family/c-common.c
index a8775ab..df7819f 100644
--- a/gcc/c-family/c-common.c
+++ b/gcc/c-family/c-common.c
@@ -667,10 +667,6 @@  const struct attribute_spec c_common_attribute_table[] =
 			      handle_noinline_attribute, false },
   { "noclone",                0, 0, true,  false, false,
 			      handle_noclone_attribute, false },
-  { "hsa",                    0, 0, true,  false, false,
-			      handle_hsa_attribute, false },
-  { "hsakernel",              0, 0, true,  false, false,
-			      handle_hsa_attribute, false },
   { "hsafunc",                0, 0, true,  false, false,
 			      handle_hsa_attribute, false },
   { "no_icf",                 0, 0, true,  false, false,
@@ -7369,11 +7365,6 @@  handle_hsa_attribute (tree *node, tree name,
 
   TREE_USED (*node) = 1;
   DECL_UNINLINABLE (*node) = 1;
-  if (strcmp ("hsakernel", IDENTIFIER_POINTER (name)) == 0
-      && (TREE_VALUE (tree_last (TYPE_ARG_TYPES (TREE_TYPE (*node))))
-	  == void_type_node))
-    warning (OPT_Wattributes, "%qE attribute on a function with fixed number "
-	     "of argument makes no sense", name);
 
   return NULL_TREE;
 }
diff --git a/gcc/cgraph.h b/gcc/cgraph.h
index c487f10..b742c8c 100644
--- a/gcc/cgraph.h
+++ b/gcc/cgraph.h
@@ -524,10 +524,6 @@  public:
   /* Section name. Again can be private, if allowed.  */
   section_hash_entry *x_section;
 
-  /* TODO: Consider moving this to a summary.
-     The node this HSA node corresponds to.  */
-  symtab_node *hsa_imp_of;
-
  protected:
   /* Dump base fields of symtab nodes to F.  Not to be used directly.  */
   void dump_base (FILE *);
diff --git a/gcc/hsa-brig.c b/gcc/hsa-brig.c
index a375cf2..7daae33 100644
--- a/gcc/hsa-brig.c
+++ b/gcc/hsa-brig.c
@@ -37,7 +37,6 @@  along with GCC; see the file COPYING3.  If not see
 #include "stor-layout.h"
 #include "tree-cfg.h"
 #include "tree-ssa-alias.h"
-#include "machmode.h"
 #include "output.h"
 #include "gimple-expr.h"
 #include "dominance.h"
@@ -51,10 +50,26 @@  along with GCC; see the file COPYING3.  If not see
 #include "gimple-pretty-print.h"
 #include "diagnostic-core.h"
 #include "hash-map.h"
-#include "ipa-ref.h"
 #include "lto-streamer.h"
 #include "cgraph.h"
 #include "real.h"
+#include "gimple-iterator.h"
+#include "bitmap.h"
+#include "dumpfile.h"
+#include "alloc-pool.h"
+#include "tree-ssa-operands.h"
+#include "gimple-ssa.h"
+#include "tree-phinodes.h"
+#include "tree-ssanames.h"
+#include "rtl.h"
+#include "expr.h"
+#include "tree-dfa.h"
+#include "ssa-iterators.h"
+#include "ipa-ref.h"
+#include "gimplify-me.h"
+#include "print-tree.h"
+#include "cfghooks.h"
+#include "symbol-summary.h"
 #include "hsa.h"
 
 #define BRIG_ELF_SECTION_NAME ".brig"
@@ -116,6 +131,9 @@  static bool brig_initialized = false;
 /* Mapping between emitted HSA functions and their offset in code segment.  */
 static hash_map<tree, BrigCodeOffset32_t> *function_offsets;
 
+/* Set of emitted function declarations.  */
+static hash_set <tree> *emitted_declarations;
+
 struct function_linkage_pair
 {
   function_linkage_pair (tree decl, unsigned int off):
@@ -128,6 +146,9 @@  struct function_linkage_pair
   unsigned int offset;
 };
 
+/* Vector of function calls where we need to resolve function offsets.  */
+static auto_vec <function_linkage_pair> function_call_linkage;
+
 /* Add a new chunk, allocate data for it and initialize it.  */
 
 void
@@ -404,6 +425,21 @@  brig_init (void)
       char* extension = strchr (modname, '.');
       if (extension)
 	*extension = '\0';
+
+      /* In LTO mode, each LTRANS unit has to emit a different module name.  */
+      if (flag_ltrans)
+	{
+	  part = strrchr (asm_file_name, '/');
+	  if (!part)
+	    part = asm_file_name;
+	  else
+	    part++;
+	  char *modname2;
+	  asprintf (&modname2, "%s_%s", modname, part);
+	  free (modname);
+	  modname = modname2;
+	}
+
       hsa_sanitize_name (modname);
       moddir.name = brig_emit_string (modname);
       free (modname);
@@ -570,7 +606,7 @@  emit_directive_variable (struct hsa_symbol *symbol)
    definition F.  */
 
 static BrigDirectiveExecutable *
-emit_function_directives (hsa_function_representation *f)
+emit_function_directives (hsa_function_representation *f, bool is_declaration)
 {
   struct BrigDirectiveExecutable fndir;
   unsigned name_offset, inarg_off, scoped_off, next_toplev_off;
@@ -621,7 +657,10 @@  emit_function_directives (hsa_function_representation *f)
     fndir.modifier.allBits |= BRIG_EXECUTABLE_DEFINITION;
   memset (&fndir.reserved, 0, sizeof (fndir.reserved));
 
-  function_offsets->put (f->decl, brig_code.total_size);
+  /* Once function_offsets records the offset of a function definition, do
+     not overwrite it with the offset of a declaration of that function.  */
+  if (!function_offsets->get (f->decl) || !is_declaration)
+    function_offsets->put (f->decl, brig_code.total_size);
 
   brig_code.add (&fndir, sizeof (fndir));
   /* XXX terrible hack: we need to set instCount after we emit all
@@ -1048,7 +1087,7 @@  emit_function_declaration (tree decl)
 {
   hsa_function_representation *f = hsa_generate_function_declaration (decl);
 
-  emit_function_directives (f);
+  emit_function_directives (f, true);
   emit_queued_operands ();
 
   delete f;
@@ -1423,11 +1462,9 @@  emit_call_insn (hsa_insn_basic *insn)
   operand_offsets[0] = htole32 (enqueue_op (call->result_code_list));
 
   /* Operand 1: func */
-  BrigCodeOffset32_t *func_offset = function_offsets->get
-    (call->called_function);
-  gcc_assert (func_offset != NULL);
-  call->func.directive_offset = *func_offset;
   unsigned int offset = enqueue_op (&call->func);
+  function_call_linkage.safe_push
+    (function_linkage_pair (call->called_function, offset));
 
   operand_offsets[1] = htole32 (offset);
   /* Operand 2: in-args.  */
@@ -1746,18 +1783,22 @@  hsa_brig_emit_function (void)
   if (!function_offsets)
     function_offsets = new hash_map<tree, BrigCodeOffset32_t> ();
 
+  if (!emitted_declarations)
+    emitted_declarations = new hash_set<tree> ();
+
   for (unsigned i = 0; i < hsa_cfun->called_functions.length (); i++)
     {
       tree called = hsa_cfun->called_functions[i];
 
-      if (function_offsets->get (called) == NULL)
+      /* Emit a declaration of each called function just once.  */
+      if (!emitted_declarations->contains (called))
 	{
 	  emit_function_declaration (called);
-	  gcc_assert (function_offsets->get (called) != NULL);
+	  emitted_declarations->add (called);
 	}
     }
 
-  ptr_to_fndir = emit_function_directives (hsa_cfun);
+  ptr_to_fndir = emit_function_directives (hsa_cfun, false);
   for (insn = hsa_bb_for_bb (ENTRY_BLOCK_PTR_FOR_FN (cfun))->first_insn;
        insn;
        insn = insn->next)
@@ -1790,10 +1831,6 @@  hsa_output_kernel_mapping (tree brig_decl)
 {
   unsigned map_count = hsa_get_number_decl_kernel_mappings ();
 
-  /* If the current TU does not contain a kernel, no mapping is produced.  */
-  if (map_count == 0)
-    return;
-
   tree int_num_of_kernels;
   int_num_of_kernels = build_int_cst (uint32_type_node, map_count);
   tree kernel_num_index_type = build_index_type (int_num_of_kernels);
@@ -1804,8 +1841,9 @@  hsa_output_kernel_mapping (tree brig_decl)
   for (unsigned i = 0; i < map_count; ++i)
     {
       tree decl = hsa_get_decl_kernel_mapping_decl (i);
-      CONSTRUCTOR_APPEND_ELT (host_functions_vec, NULL_TREE,
-			      build_fold_addr_expr (decl));
+      CONSTRUCTOR_APPEND_ELT
+	(host_functions_vec, NULL_TREE,
+	 build_fold_addr_expr (hsa_get_host_function (decl)));
     }
   tree host_functions_ctor = build_constructor (host_functions_array_type,
 						host_functions_vec);
@@ -2106,6 +2144,18 @@  hsa_output_brig (void)
   if (!brig_initialized)
     return;
 
+  for (unsigned i = 0; i < function_call_linkage.length (); i++)
+    {
+      function_linkage_pair p = function_call_linkage[i];
+
+      BrigCodeOffset32_t *func_offset = function_offsets->get (p.function_decl);
+      gcc_assert (*func_offset);
+      BrigOperandCodeRef *code_ref = (BrigOperandCodeRef *)
+	(brig_operand.get_ptr_by_offset (p.offset));
+      gcc_assert (code_ref->base.kind == BRIG_KIND_OPERAND_CODE_REF);
+      code_ref->ref = htole32 (*func_offset);
+    }
+
   saved_section = in_section;
 
   switch_to_section (get_section (BRIG_ELF_SECTION_NAME, SECTION_NOTYPE, NULL));
@@ -2178,4 +2228,7 @@  hsa_output_brig (void)
   hsa_free_decl_kernel_mapping ();
   brig_release_data ();
   hsa_deinit_compilation_unit_data ();
+
+  delete emitted_declarations;
+  delete function_offsets;
 }
diff --git a/gcc/hsa-dump.c b/gcc/hsa-dump.c
index 4d78519..134005b 100644
--- a/gcc/hsa-dump.c
+++ b/gcc/hsa-dump.c
@@ -22,27 +22,55 @@  along with GCC; see the file COPYING3.  If not see
 #include "coretypes.h"
 #include "tm.h"
 #include "is-a.h"
-#include "vec.h"
-#include "hash-set.h"
 #include "defaults.h"
 #include "hard-reg-set.h"
-#include "dominance.h"
-#include "cfg.h"
-#include "input.h"
-#include "function.h"
+#include "hash-set.h"
+#include "vec.h"
 #include "symtab.h"
+#include "vec.h"
+#include "input.h"
 #include "alias.h"
 #include "double-int.h"
 #include "inchash.h"
 #include "tree.h"
+#include "tree-pass.h"
 #include "tree-ssa-alias.h"
 #include "internal-fn.h"
 #include "gimple-expr.h"
+#include "dominance.h"
+#include "cfg.h"
+#include "cfganal.h"
+#include "function.h"
 #include "predict.h"
 #include "basic-block.h"
 #include "fold-const.h"
 #include "gimple.h"
+#include "gimple-iterator.h"
+#include "machmode.h"
+#include "output.h"
+#include "function.h"
+#include "bitmap.h"
+#include "dumpfile.h"
 #include "gimple-pretty-print.h"
+#include "diagnostic-core.h"
+#include "alloc-pool.h"
+#include "tree-ssa-operands.h"
+#include "gimple-ssa.h"
+#include "tree-phinodes.h"
+#include "stringpool.h"
+#include "tree-ssanames.h"
+#include "rtl.h"
+#include "expr.h"
+#include "tree-dfa.h"
+#include "ssa-iterators.h"
+#include "ipa-ref.h"
+#include "lto-streamer.h"
+#include "cgraph.h"
+#include "stor-layout.h"
+#include "gimplify-me.h"
+#include "print-tree.h"
+#include "cfghooks.h"
+#include "symbol-summary.h"
 #include "hsa.h"
 
 /* Return textual name of TYPE.  */
diff --git a/gcc/hsa-gen.c b/gcc/hsa-gen.c
index 5c53876..5065394 100644
--- a/gcc/hsa-gen.c
+++ b/gcc/hsa-gen.c
@@ -70,7 +70,7 @@  along with GCC; see the file COPYING3.  If not see
 #include "stor-layout.h"
 #include "gimplify-me.h"
 #include "print-tree.h"
-#include "cfghooks.h"
+#include "symbol-summary.h"
 #include "hsa.h"
 #include "cfghooks.h"
 
@@ -662,6 +662,32 @@  get_symbol_for_decl (tree decl)
   return sym;
 }
 
+/* For a given host function declaration DECL, return the declaration of
+   its GPU implementation.  */
+
+static tree
+hsa_get_gpu_function (tree decl)
+{
+  hsa_function_summary *s = hsa_summaries->get (cgraph_node::get_create (decl));
+  gcc_assert (s->kind != HSA_NONE);
+  gcc_assert (!s->gpu_implementation_p);
+
+  return s->binded_function->decl;
+}
+
+/* For a given HSA function declaration, return a host
+   function declaration.  */
+
+tree
+hsa_get_host_function (tree decl)
+{
+  hsa_function_summary *s = hsa_summaries->get (cgraph_node::get_create (decl));
+  gcc_assert (s->kind != HSA_NONE);
+  gcc_assert (s->gpu_implementation_p);
+
+  return s->binded_function->decl;
+}
+
 /* Create a spill symbol of type TYPE.  */
 
 hsa_symbol *
@@ -2664,7 +2690,8 @@  static void
 gen_hsa_insns_for_direct_call (gimple stmt, hsa_bb *hbb,
 			       vec <hsa_op_reg_p> *ssa_map)
 {
-  hsa_insn_call *call_insn = new hsa_insn_call (gimple_call_fndecl (stmt));
+  tree decl = gimple_call_fndecl (stmt);
+  hsa_insn_call *call_insn = new hsa_insn_call (decl);
   hsa_cfun->called_functions.safe_push (call_insn->called_function);
 
   /* Argument block start.  */
@@ -2702,7 +2729,7 @@  gen_hsa_insns_for_direct_call (gimple stmt, hsa_bb *hbb,
   call_insn->args_code_list = new hsa_op_code_list (args);
   hbb->append_insn (call_insn);
 
-  tree result_type = TREE_TYPE (TREE_TYPE (gimple_call_fndecl (stmt)));
+  tree result_type = TREE_TYPE (TREE_TYPE (decl));
 
   tree result = gimple_call_lhs (stmt);
   hsa_insn_mem *result_insn = NULL;
@@ -2796,8 +2823,7 @@  static bool
 gen_hsa_insns_for_known_library_call (gimple stmt, hsa_bb *hbb,
 				      vec <hsa_op_reg_p> *ssa_map)
 {
-  tree decl = gimple_call_fndecl (stmt);
-  const char *name = hsa_get_declaration_name (decl);
+  const char *name = hsa_get_declaration_name (gimple_call_fndecl (stmt));
 
   if (strcmp (name, "omp_is_initial_device") == 0)
     {
@@ -3474,7 +3500,8 @@  specialop:
 	called = TREE_OPERAND (called, 0);
 	gcc_checking_assert (TREE_CODE (called) == FUNCTION_DECL);
 
-	const char *name = hsa_get_declaration_name (called);
+	const char *name = hsa_get_declaration_name
+	  (hsa_get_gpu_function (called));
 	hsa_add_kernel_dependency (hsa_cfun->decl,
 				   hsa_brig_function_name (name));
 	gen_hsa_insns_for_kernel_call (hbb, as_a <gcall *> (stmt));
@@ -3833,6 +3860,7 @@  hsa_generate_function_declaration (tree decl)
   fun->declaration_p = true;
   fun->decl = decl;
   fun->name = xstrdup (hsa_get_declaration_name (decl));
+  hsa_sanitize_name (fun->name);
 
   gen_function_decl_parameters (fun, decl);
 
@@ -3844,19 +3872,19 @@  hsa_generate_function_declaration (tree decl)
    considered an HSA kernel callable from the host, otherwise it will be
    compiled as an HSA function callable from other HSA code.  */
 
-static unsigned int
+static void
 generate_hsa (bool kernel)
 {
   if (DECL_STATIC_CHAIN (cfun->decl))
     {
       sorry ("HSA does not support nested functions");
-      return 0;
+      return;
     }
   else if (!TYPE_ARG_TYPES (TREE_TYPE (cfun->decl)))
     {
       sorry ("HSA does not support functions with variadic arguments "
 	     "(or unknown return type)");
-      return 0;
+      return;
     }
 
   vec <hsa_op_reg_p> ssa_map = vNULL;
@@ -3879,13 +3907,7 @@  generate_hsa (bool kernel)
 
   if (hsa_cfun->kern_p)
     {
-      cgraph_node *node = cgraph_node::get_create (current_function_decl);
-      tree host_decl;
-      if (node->hsa_imp_of)
-	host_decl = node->hsa_imp_of->decl;
-      else
-	host_decl = current_function_decl;
-      hsa_add_kern_decl_mapping (host_decl, hsa_cfun->name,
+      hsa_add_kern_decl_mapping (current_function_decl, hsa_cfun->name,
 				 hsa_cfun->maximum_omp_data_size);
     }
 
@@ -3903,197 +3925,6 @@  generate_hsa (bool kernel)
 
  fail:
   hsa_deinit_data_for_cfun ();
-  return 0;
-}
-
-static GTY(()) tree hsa_launch_fn;
-static GTY(()) tree hsa_dim_array_type;
-static GTY(()) tree hsa_lattrs_dimnum_decl;
-static GTY(()) tree hsa_lattrs_grid_decl;
-static GTY(()) tree hsa_lattrs_group_decl;
-static GTY(()) tree hsa_lattrs_nargs_decl;
-static GTY(()) tree hsa_launch_attributes_type;
-
-static void
-init_hsa_functions (void)
-{
-  if (hsa_launch_fn)
-    return;
-
-  tree dim_arr_index_type;
-  dim_arr_index_type = build_index_type (build_int_cst (integer_type_node, 2));
-  hsa_dim_array_type = build_array_type (uint32_type_node, dim_arr_index_type);
-
-  hsa_launch_attributes_type = make_node (RECORD_TYPE);
-  hsa_lattrs_dimnum_decl = build_decl (BUILTINS_LOCATION, FIELD_DECL,
-				       get_identifier ("ndim"),
-				       uint32_type_node);
-  DECL_CHAIN (hsa_lattrs_dimnum_decl) = NULL_TREE;
-
-  hsa_lattrs_grid_decl = build_decl (BUILTINS_LOCATION, FIELD_DECL,
-				    get_identifier ("global_size"),
-				    hsa_dim_array_type);
-  DECL_CHAIN (hsa_lattrs_grid_decl) = hsa_lattrs_dimnum_decl;
-  hsa_lattrs_group_decl = build_decl (BUILTINS_LOCATION, FIELD_DECL,
-				     get_identifier ("group_size"),
-				     hsa_dim_array_type);
-  DECL_CHAIN (hsa_lattrs_group_decl) = hsa_lattrs_grid_decl;
-  hsa_lattrs_nargs_decl = build_decl (BUILTINS_LOCATION, FIELD_DECL,
-				      get_identifier ("nargs"),
-				      uint32_type_node);
-  DECL_CHAIN (hsa_lattrs_nargs_decl) = hsa_lattrs_group_decl;
-  finish_builtin_struct (hsa_launch_attributes_type, "__hsa_launch_attributes",
-			 hsa_lattrs_nargs_decl, NULL_TREE);
-  tree launch_fn_type;
-  launch_fn_type
-    = build_function_type_list (void_type_node, ptr_type_node,
-				build_pointer_type (hsa_launch_attributes_type),
-				build_pointer_type (uint64_type_node),
-				NULL_TREE);
-
-  hsa_launch_fn = build_fn_decl ("__hsa_launch_kernel", launch_fn_type);
-}
-
-/* Insert before the current statement in GSI a store of VALUE to INDEX of
-   array (of type hsa_dim_array_type) FLD_DECL of RANGE_VAR.  VALUE must be of
-   type uint32_type_node.  */
-
-static void
-insert_store_range_dim (gimple_stmt_iterator *gsi, tree range_var,
-			tree fld_decl, int index, tree value)
-{
-  tree ref = build4 (ARRAY_REF, uint32_type_node,
-		     build3 (COMPONENT_REF, hsa_dim_array_type,
-			     range_var, fld_decl, NULL_TREE),
-		     build_int_cst (integer_type_node, index),
-		     NULL_TREE, NULL_TREE);
-  gsi_insert_before (gsi, gimple_build_assign (ref, value), GSI_SAME_STMT);
-}
-
-/* Generate call to invoke kernel implementing function FNDECL.  */
-
-static void
-wrap_hsa_kernel_call (gimple_stmt_iterator *gsi, tree fndecl)
-{
-  init_hsa_functions ();
-
-  bool real_kern_p = lookup_attribute ("hsakernel", DECL_ATTRIBUTES (fndecl));
-  tree grid_size_1, group_size_1;
-  tree u32_one = build_int_cst (uint32_type_node, 1);
-  gimple call_stmt = gsi_stmt (*gsi);
-  unsigned discard_arguents, num_args = gimple_call_num_args (call_stmt);
-  if (real_kern_p)
-    {
-      discard_arguents = 2;
-      if (num_args < 2)
-	{
-	  error ("Calls to functions with hsakernel attribute must "
-		 "have at least two arguments.");
-	  grid_size_1 = group_size_1 = u32_one;
-	}
-      else
-	{
-	  grid_size_1 = fold_convert (uint32_type_node,
-				      gimple_call_arg (call_stmt, num_args - 2));
-	  grid_size_1 = force_gimple_operand_gsi (gsi, grid_size_1, true,
-						  NULL_TREE, true,
-						  GSI_SAME_STMT);
-	  group_size_1 = fold_convert (uint32_type_node,
-				       gimple_call_arg (call_stmt,
-							num_args - 1));
-	  group_size_1 = force_gimple_operand_gsi (gsi, group_size_1, true,
-						   NULL_TREE, true,
-						   GSI_SAME_STMT);
-	}
-    }
-  else
-    {
-      discard_arguents = 0;
-      grid_size_1 = build_int_cst (uint32_type_node, 64);
-      group_size_1 = build_int_cst (uint32_type_node, 64);
-    }
-
-  tree lattrs = create_tmp_var (hsa_launch_attributes_type,
-				"__hsa_launch_attrs");
-  tree dimref = build3 (COMPONENT_REF, uint32_type_node,
-			lattrs, hsa_lattrs_dimnum_decl, NULL_TREE);
-  gsi_insert_before (gsi, gimple_build_assign (dimref, u32_one), GSI_SAME_STMT);
-  insert_store_range_dim (gsi, lattrs, hsa_lattrs_grid_decl, 0,
-			  grid_size_1);
-  insert_store_range_dim (gsi, lattrs, hsa_lattrs_grid_decl, 1,
-			  u32_one);
-  insert_store_range_dim (gsi, lattrs, hsa_lattrs_grid_decl, 2,
-			  u32_one);
-  insert_store_range_dim (gsi, lattrs, hsa_lattrs_group_decl, 0,
-			  group_size_1);
-  insert_store_range_dim (gsi, lattrs, hsa_lattrs_group_decl, 1,
-			  u32_one);
-  insert_store_range_dim (gsi, lattrs, hsa_lattrs_group_decl, 2,
-			  u32_one);
-  tree nargsref = build3 (COMPONENT_REF, uint32_type_node,
-			 lattrs, hsa_lattrs_nargs_decl, NULL_TREE);
-  tree nargsval = build_int_cst (uint32_type_node, num_args - discard_arguents);
-  gsi_insert_before (gsi, gimple_build_assign (nargsref, nargsval),
-		     GSI_SAME_STMT);
-  lattrs = build_fold_addr_expr (lattrs);
-
-  tree args;
-  args = create_tmp_var (build_array_type_nelts (uint64_type_node,
-						 num_args - discard_arguents),
-			 NULL);
-
-  gcc_assert (num_args >= discard_arguents);
-  for (unsigned i = 0; i < (num_args - discard_arguents); i++)
-    {
-      tree arg = gimple_call_arg (call_stmt, i);
-      gimple g;
-
-      tree r = build4 (ARRAY_REF, uint64_type_node, args,
-		       size_int (i), NULL_TREE, NULL_TREE);
-
-      arg = force_gimple_operand_gsi (gsi, fold_convert (uint64_type_node, arg),
-				      true, NULL_TREE, true, GSI_SAME_STMT);
-      g = gimple_build_assign (r, arg);
-      gsi_insert_before (gsi, g, GSI_SAME_STMT);
-    }
-
-  args = build_fold_addr_expr (args);
-
-  /* XXX doesn't handle calls with lhs, doesn't remove EH
-     edges.  */
-  gimple launch = gimple_build_call (hsa_launch_fn, 3,
-				     build_fold_addr_expr (fndecl),
-				     lattrs, args);
-  gsi_insert_before (gsi, launch, GSI_SAME_STMT);
-  unlink_stmt_vdef (call_stmt);
-  gsi_remove (gsi, true);
-}
-
-/* Replace calls of functions which have been turned into HSA kernels into
-   their invocation via HSA run-time.  */
-
-static unsigned int
-wrap_all_hsa_calls (void)
-{
-  bool changed = false;
-  basic_block bb;
-  FOR_ALL_BB_FN (bb, cfun)
-    {
-      gimple_stmt_iterator gsi;
-      tree fndecl;
-      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
-	if (is_gimple_call (gsi_stmt (gsi))
-	    && (fndecl = gimple_call_fndecl (gsi_stmt (gsi)))
-	    && (lookup_attribute ("hsa", DECL_ATTRIBUTES (fndecl))
-		|| lookup_attribute ("hsakernel", DECL_ATTRIBUTES (fndecl))))
-	  {
-	    wrap_hsa_kernel_call (&gsi, fndecl);
-	    changed = true;
-	  }
-	else
-	  gsi_next (&gsi);
-    }
-  return changed ? TODO_cleanup_cfg | TODO_update_ssa : 0;
 }
 
 namespace {
@@ -4135,15 +3966,17 @@  pass_gen_hsail::gate (function *)
 unsigned int
 pass_gen_hsail::execute (function *)
 {
-  if (cgraph_node::get_create (current_function_decl)->hsa_imp_of
-      || lookup_attribute ("hsa", DECL_ATTRIBUTES (current_function_decl))
-      || lookup_attribute ("hsakernel",
-			   DECL_ATTRIBUTES (current_function_decl)))
-    return generate_hsa (true);
-  else if (hsa_callable_function_p (current_function_decl))
-    return generate_hsa (false);
-  else
-    return wrap_all_hsa_calls ();
+  hsa_function_summary *s = hsa_summaries->get
+    (cgraph_node::get_create (current_function_decl));
+
+  if (s->gpu_implementation_p)
+    {
+      generate_hsa (s->kind == HSA_KERNEL);
+      TREE_ASM_WRITTEN (current_function_decl) = 1;
+      return TODO_stop_pass_execution;
+    }
+
+  return 0;
 }
 
 } // anon namespace
@@ -4155,5 +3988,3 @@  make_pass_gen_hsail (gcc::context *ctxt)
 {
   return new pass_gen_hsail (ctxt);
 }
-
-#include "gt-hsa-gen.h"
diff --git a/gcc/hsa-regalloc.c b/gcc/hsa-regalloc.c
index 75bcc8a..bb93c35 100644
--- a/gcc/hsa-regalloc.c
+++ b/gcc/hsa-regalloc.c
@@ -27,27 +27,50 @@  along with GCC; see the file COPYING3.  If not see
 #include "hash-set.h"
 #include "vec.h"
 #include "symtab.h"
+#include "vec.h"
 #include "input.h"
 #include "alias.h"
 #include "double-int.h"
 #include "inchash.h"
 #include "tree.h"
+#include "tree-pass.h"
 #include "tree-ssa-alias.h"
 #include "internal-fn.h"
 #include "gimple-expr.h"
 #include "dominance.h"
 #include "cfg.h"
-#include "cfghooks.h"
+#include "cfganal.h"
 #include "function.h"
 #include "predict.h"
 #include "basic-block.h"
 #include "fold-const.h"
 #include "gimple.h"
+#include "gimple-iterator.h"
+#include "machmode.h"
+#include "output.h"
+#include "function.h"
 #include "bitmap.h"
 #include "dumpfile.h"
 #include "gimple-pretty-print.h"
 #include "diagnostic-core.h"
-#include "cfganal.h"
+#include "alloc-pool.h"
+#include "tree-ssa-operands.h"
+#include "gimple-ssa.h"
+#include "tree-phinodes.h"
+#include "stringpool.h"
+#include "tree-ssanames.h"
+#include "rtl.h"
+#include "expr.h"
+#include "tree-dfa.h"
+#include "ssa-iterators.h"
+#include "ipa-ref.h"
+#include "lto-streamer.h"
+#include "cgraph.h"
+#include "stor-layout.h"
+#include "gimplify-me.h"
+#include "print-tree.h"
+#include "cfghooks.h"
+#include "symbol-summary.h"
 #include "hsa.h"
 
 
diff --git a/gcc/hsa.c b/gcc/hsa.c
index 4ad44fe..017b4ca 100644
--- a/gcc/hsa.c
+++ b/gcc/hsa.c
@@ -70,6 +70,7 @@  along with GCC; see the file COPYING3.  If not see
 #include "stor-layout.h"
 #include "gimplify-me.h"
 #include "print-tree.h"
+#include "symbol-summary.h"
 #include "hsa.h"
 
 /* Structure containing intermediate HSA representation of the generated
@@ -100,6 +101,9 @@  hash_map <tree, vec <char *> *> *hsa_decl_kernel_dependencies;
 /* Hash function to lookup a symbol for a decl.  */
 hash_table <hsa_free_symbol_hasher> *hsa_global_variable_symbols;
 
+/* HSA summaries.  */
+hsa_summary_t *hsa_summaries = NULL;
+
 /* True if compilation unit-wide data are already allocated and initialized.  */
 static bool compilation_unit_data_initialized;
 
@@ -464,10 +468,34 @@  hsa_get_declaration_name (tree decl)
       free (b);
       return ggc_str;
     }
+  else if (TREE_CODE (decl) == FUNCTION_DECL)
+    return cgraph_node::get_create (decl)->asm_name ();
   else
     return IDENTIFIER_POINTER (DECL_NAME (decl));
 
   return NULL;
 }
 
+/* Mark the HOST function as an HSA kernel in HSA summaries.  */
+
+void
+hsa_register_kernel (cgraph_node *host)
+{
+  if (hsa_summaries == NULL)
+    hsa_summaries = new hsa_summary_t (symtab);
+  hsa_function_summary *s = hsa_summaries->get (host);
+  s->kind = HSA_KERNEL;
+}
+
+/* Add a pair of functions to HSA summaries.  GPU is an HSA implementation of
+   a HOST function.  */
+
+void
+hsa_register_kernel (cgraph_node *gpu, cgraph_node *host)
+{
+  if (hsa_summaries == NULL)
+    hsa_summaries = new hsa_summary_t (symtab);
+  hsa_summaries->link_functions (gpu, host, HSA_KERNEL);
+}
+
 #include "gt-hsa.h"
diff --git a/gcc/hsa.h b/gcc/hsa.h
index 8ebfcaa..c6cd124 100644
--- a/gcc/hsa.h
+++ b/gcc/hsa.h
@@ -889,10 +889,69 @@  public:
   unsigned maximum_omp_data_size;
 };
 
+enum hsa_function_kind
+{
+  HSA_NONE,
+  HSA_KERNEL,
+  HSA_FUNCTION
+};
+
+struct hsa_function_summary
+{
+  /* Default constructor.  */
+  hsa_function_summary ();
+
+  /* Kind of GPU/host function.  */
+  hsa_function_kind kind;
+
+  /* Pointer to the cgraph node that is the HSA implementation of the
+     function.  If the function itself is the HSA implementation,
+     binded_function points to the host function instead.  */
+  cgraph_node *binded_function;
+
+  /* Identifies if the function is an HSA function or a host function.  */
+  bool gpu_implementation_p;
+};
+
+inline
+hsa_function_summary::hsa_function_summary (): kind (HSA_NONE),
+  binded_function (NULL), gpu_implementation_p (false)
+{
+}
+
+/* Function summary for HSA functions.  */
+class hsa_summary_t: public function_summary <hsa_function_summary *>
+{
+public:
+  hsa_summary_t (symbol_table *table):
+    function_summary<hsa_function_summary *> (table) { }
+
+  void link_functions (cgraph_node *gpu, cgraph_node *host,
+		       hsa_function_kind kind);
+};
+
+inline void
+hsa_summary_t::link_functions (cgraph_node *gpu, cgraph_node *host,
+			       hsa_function_kind kind)
+{
+  hsa_function_summary *gpu_summary = get (gpu);
+  hsa_function_summary *host_summary = get (host);
+
+  gpu_summary->kind = kind;
+  host_summary->kind = kind;
+
+  gpu_summary->gpu_implementation_p = true;
+  host_summary->gpu_implementation_p = false;
+
+  gpu_summary->binded_function = host;
+  host_summary->binded_function = gpu;
+}
+
 /* in hsa.c */
 extern struct hsa_function_representation *hsa_cfun;
 extern hash_table <hsa_free_symbol_hasher> *hsa_global_variable_symbols;
 extern hash_map <tree, vec <char *> *> *hsa_decl_kernel_dependencies;
+extern hsa_summary_t *hsa_summaries;
 extern unsigned hsa_kernel_calls_counter;
 bool hsa_callable_function_p (tree fndecl);
 void hsa_init_compilation_unit_data (void);
@@ -915,6 +974,8 @@  void hsa_add_kernel_dependency (tree caller, char *called_function);
 void hsa_sanitize_name (char *p);
 char *hsa_brig_function_name (const char *p);
 const char *hsa_get_declaration_name (tree decl);
+void hsa_register_kernel (cgraph_node *host);
+void hsa_register_kernel (cgraph_node *gpu, cgraph_node *host);
 
 /* In hsa-gen.c.  */
 void hsa_build_append_simple_mov (hsa_op_reg *, hsa_op_base *, hsa_bb *);
@@ -924,6 +985,7 @@  hsa_op_reg *hsa_spill_in (hsa_insn_basic *, hsa_op_reg *, hsa_op_reg **);
 hsa_op_reg *hsa_spill_out (hsa_insn_basic *, hsa_op_reg *, hsa_op_reg **);
 hsa_bb *hsa_init_new_bb (basic_block);
 hsa_function_representation *hsa_generate_function_declaration (tree decl);
+tree hsa_get_host_function (tree decl);
 
 /* In hsa-regalloc.c.  */
 void hsa_regalloc (void);
diff --git a/gcc/ipa-hsa.c b/gcc/ipa-hsa.c
new file mode 100644
index 0000000..24d3fe4
--- /dev/null
+++ b/gcc/ipa-hsa.c
@@ -0,0 +1,330 @@ 
+/* Callgraph based creation of HSA clones.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   Contributed by Martin Liska <mliska@suse.cz>
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+/* Interprocedural HSA pass is responsible for creation of HSA clones.
+   For all these HSA clones, we emit HSAIL instructions and pass processing
+   is terminated.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "tm.h"
+#include "is-a.h"
+#include "defaults.h"
+#include "hard-reg-set.h"
+#include "hash-set.h"
+#include "vec.h"
+#include "symtab.h"
+#include "vec.h"
+#include "input.h"
+#include "alias.h"
+#include "double-int.h"
+#include "inchash.h"
+#include "tree.h"
+#include "tree-pass.h"
+#include "tree-ssa-alias.h"
+#include "internal-fn.h"
+#include "gimple-expr.h"
+#include "dominance.h"
+#include "cfg.h"
+#include "cfganal.h"
+#include "function.h"
+#include "predict.h"
+#include "basic-block.h"
+#include "fold-const.h"
+#include "gimple.h"
+#include "gimple-iterator.h"
+#include "machmode.h"
+#include "output.h"
+#include "function.h"
+#include "bitmap.h"
+#include "dumpfile.h"
+#include "gimple-pretty-print.h"
+#include "tree-streamer.h"
+#include "diagnostic-core.h"
+#include "alloc-pool.h"
+#include "tree-ssa-operands.h"
+#include "gimple-ssa.h"
+#include "tree-phinodes.h"
+#include "stringpool.h"
+#include "tree-ssanames.h"
+#include "rtl.h"
+#include "expr.h"
+#include "tree-dfa.h"
+#include "ssa-iterators.h"
+#include "ipa-ref.h"
+#include "lto-streamer.h"
+#include "cgraph.h"
+#include "stor-layout.h"
+#include "gimplify-me.h"
+#include "print-tree.h"
+#include "cfghooks.h"
+#include "symbol-summary.h"
+#include "hsa.h"
+
+namespace {
+
+/* Create HSA clones and make calls among GPU implementations target them.  */
+static unsigned int
+process_hsa_functions (void)
+{
+  struct cgraph_node *node;
+
+  if (hsa_summaries == NULL)
+    hsa_summaries = new hsa_summary_t (symtab);
+
+  FOR_EACH_DEFINED_FUNCTION (node)
+    {
+      hsa_function_summary *s = hsa_summaries->get (node);
+
+      /* A linked function is skipped.  */
+      if (s->binded_function != NULL)
+	continue;
+
+      if (s->kind != HSA_NONE)
+	{
+	  cgraph_node *clone = node->create_virtual_clone
+	    (vec <cgraph_edge *> (), NULL, NULL, "hsa");
+
+	  clone->force_output = true;
+	  hsa_summaries->link_functions (clone, node, s->kind);
+
+	  if (dump_file)
+	    fprintf (dump_file, "HSA creates a new clone: %s, type: %s\n",
+		     clone->name (),
+		     s->kind == HSA_KERNEL ? "kernel" : "function");
+	}
+      else if (hsa_callable_function_p (node->decl))
+	{
+	  cgraph_node *clone = node->create_virtual_clone
+	    (vec <cgraph_edge *> (), NULL, NULL, "hsa");
+
+	  hsa_summaries->link_functions (clone, node, HSA_FUNCTION);
+
+	  if (dump_file)
+	    fprintf (dump_file, "HSA creates a new function clone: %s\n",
+		     clone->name ());
+	}
+    }
+
+  /* Redirect all edges that are between HSA clones.  */
+  FOR_EACH_DEFINED_FUNCTION (node)
+    {
+      /* Only calls made by GPU implementations are redirected.  */
+      hsa_function_summary *src = hsa_summaries->get (node);
+      if (src->kind == HSA_NONE || !src->gpu_implementation_p)
+	continue;
+
+      for (cgraph_edge *e = node->callees; e; e = e->next_callee)
+	{
+	  hsa_function_summary *dst = hsa_summaries->get (e->callee);
+	  if (dst->kind == HSA_NONE || dst->gpu_implementation_p)
+	    continue;
+
+	  /* The callee is a host function with an HSA counterpart; call the
+	     counterpart instead.  */
+	  e->redirect_callee (dst->binded_function);
+	  if (dump_file)
+	    fprintf (dump_file,
+		     "Redirecting edge to HSA function: %s->%s\n",
+		     xstrdup_for_dump (e->caller->name ()),
+		     xstrdup_for_dump (e->callee->name ()));
+	}
+    }
+
+  return 0;
+}
+
+/* Stream out the summaries of all HSA functions in the partition.  */
+static void
+ipa_hsa_write_summary (void)
+{
+  struct bitpack_d bp;
+  struct cgraph_node *node;
+  struct output_block *ob;
+  unsigned int count = 0;
+  lto_symtab_encoder_iterator lsei;
+  lto_symtab_encoder_t encoder;
+
+  if (!hsa_summaries)
+    return;
+
+  ob = create_output_block (LTO_section_ipa_hsa);
+  encoder = ob->decl_state->symtab_node_encoder;
+  ob->symbol = NULL;
+  for (lsei = lsei_start_function_in_partition (encoder); !lsei_end_p (lsei);
+       lsei_next_function_in_partition (&lsei))
+    {
+      node = lsei_cgraph_node (lsei);
+      hsa_function_summary *s = hsa_summaries->get (node);
+
+      if (s->kind != HSA_NONE)
+	count++;
+    }
+
+  streamer_write_uhwi (ob, count);
+
+  /* Stream out one record for each function counted above.  */
+  for (lsei = lsei_start_function_in_partition (encoder); !lsei_end_p (lsei);
+       lsei_next_function_in_partition (&lsei))
+    {
+      node = lsei_cgraph_node (lsei);
+      hsa_function_summary *s = hsa_summaries->get (node);
+
+      if (s->kind != HSA_NONE)
+	{
+	  int node_ref = lto_symtab_encoder_encode (encoder, node);
+	  streamer_write_uhwi (ob, node_ref);
+
+	  bp = bitpack_create (ob->main_stream);
+	  bp_pack_value (&bp, s->kind, 2);
+	  bp_pack_value (&bp, s->gpu_implementation_p, 1);
+	  bp_pack_value (&bp, s->binded_function != NULL, 1);
+	  streamer_write_bitpack (&bp);
+	  if (s->binded_function)
+	    stream_write_tree (ob, s->binded_function->decl, true);
+	}
+    }
+
+  streamer_write_char_stream (ob->main_stream, 0);
+  produce_asm (ob, NULL);
+  destroy_output_block (ob);
+}
+
+static void
+ipa_hsa_read_section (struct lto_file_decl_data *file_data, const char *data,
+		       size_t len)
+{
+  const struct lto_function_header *header =
+    (const struct lto_function_header *) data;
+  const int cfg_offset = sizeof (struct lto_function_header);
+  const int main_offset = cfg_offset + header->cfg_size;
+  const int string_offset = main_offset + header->main_size;
+  struct data_in *data_in;
+  unsigned int i;
+  unsigned int count;
+
+  lto_input_block ib_main ((const char *) data + main_offset,
+			   header->main_size, file_data->mode_table);
+
+  data_in =
+    lto_data_in_create (file_data, (const char *) data + string_offset,
+			header->string_size, vNULL);
+  count = streamer_read_uhwi (&ib_main);
+
+  for (i = 0; i < count; i++)
+    {
+      unsigned int index;
+      struct cgraph_node *node;
+      lto_symtab_encoder_t encoder;
+
+      index = streamer_read_uhwi (&ib_main);
+      encoder = file_data->symtab_node_encoder;
+      node = dyn_cast<cgraph_node *> (lto_symtab_encoder_deref (encoder,
+								index));
+      gcc_assert (node->definition);
+      hsa_function_summary *s = hsa_summaries->get (node);
+
+      struct bitpack_d bp = streamer_read_bitpack (&ib_main);
+      s->kind = (hsa_function_kind) bp_unpack_value (&bp, 2);
+      s->gpu_implementation_p = bp_unpack_value (&bp, 1);
+      bool has_tree = bp_unpack_value (&bp, 1);
+
+      if (has_tree)
+	{
+	  tree decl = stream_read_tree (&ib_main, data_in);
+	  s->binded_function = cgraph_node::get_create (decl);
+	}
+    }
+  lto_free_section_data (file_data, LTO_section_ipa_hsa, NULL, data,
+			 len);
+  lto_data_in_delete (data_in);
+}
+
+static void
+ipa_hsa_read_summary (void)
+{
+  struct lto_file_decl_data **file_data_vec = lto_get_file_decl_data ();
+  struct lto_file_decl_data *file_data;
+  unsigned int j = 0;
+
+  if (hsa_summaries == NULL)
+    hsa_summaries = new hsa_summary_t (symtab);
+
+  while ((file_data = file_data_vec[j++]))
+    {
+      size_t len;
+      const char *data = lto_get_section_data (file_data, LTO_section_ipa_hsa,
+					       NULL, &len);
+
+      if (data)
+	ipa_hsa_read_section (file_data, data, len);
+    }
+}
+
+const pass_data pass_data_ipa_hsa =
+{
+  IPA_PASS, /* type */
+  "hsa", /* name */
+  OPTGROUP_NONE, /* optinfo_flags */
+  TV_IPA_HSA, /* tv_id */
+  0, /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  TODO_dump_symtab, /* todo_flags_finish */
+};
+
+class pass_ipa_hsa : public ipa_opt_pass_d
+{
+public:
+  pass_ipa_hsa (gcc::context *ctxt)
+    : ipa_opt_pass_d (pass_data_ipa_hsa, ctxt,
+		      NULL, /* generate_summary */
+		      ipa_hsa_write_summary, /* write_summary */
+		      ipa_hsa_read_summary, /* read_summary */
+		      ipa_hsa_write_summary, /* write_optimization_summary */
+		      ipa_hsa_read_summary, /* read_optimization_summary */
+		      NULL, /* stmt_fixup */
+		      0, /* function_transform_todo_flags_start */
+		      NULL, /* function_transform */
+		      NULL) /* variable_transform */
+    {}
+
+  /* opt_pass methods: */
+  virtual bool gate (function *);
+
+  virtual unsigned int execute (function *) { return process_hsa_functions (); }
+
+}; // class pass_ipa_hsa
+
+bool
+pass_ipa_hsa::gate (function *)
+{
+  return hsa_gen_requested_p () || in_lto_p;
+}
+
+} // anon namespace
+
+ipa_opt_pass_d *
+make_pass_ipa_hsa (gcc::context *ctxt)
+{
+  return new pass_ipa_hsa (ctxt);
+}
diff --git a/gcc/lto-section-in.c b/gcc/lto-section-in.c
index 58560a8..468777a 100644
--- a/gcc/lto-section-in.c
+++ b/gcc/lto-section-in.c
@@ -68,7 +68,8 @@  const char *lto_section_name[LTO_N_SECTION_TYPES] =
   "ipcp_trans",
   "icf",
   "offload_table",
-  "mode_table"
+  "mode_table",
+  "hsa"
 };
 
 
diff --git a/gcc/lto-streamer.h b/gcc/lto-streamer.h
index 66a824e..b4455a1 100644
--- a/gcc/lto-streamer.h
+++ b/gcc/lto-streamer.h
@@ -244,6 +244,7 @@  enum lto_section_type
   LTO_section_ipa_icf,
   LTO_section_offload_table,
   LTO_section_mode_table,
+  LTO_section_ipa_hsa,
   LTO_N_SECTION_TYPES		/* Must be last.  */
 };
 
diff --git a/gcc/lto/lto-partition.c b/gcc/lto/lto-partition.c
index 8e5b555..01a60b2 100644
--- a/gcc/lto/lto-partition.c
+++ b/gcc/lto/lto-partition.c
@@ -44,6 +44,7 @@  along with GCC; see the file COPYING3.  If not see
 #include "ipa-utils.h"
 #include "lto-partition.h"
 #include "stringpool.h"
+#include "hsa.h"
 
 vec<ltrans_partition> ltrans_partitions;
 
@@ -180,6 +181,53 @@  add_symbol_to_partition_1 (ltrans_partition part, symtab_node *node)
 	 Therefore put it into the same partition.  */
       if (cnode->instrumented_version)
 	add_symbol_to_partition_1 (part, cnode->instrumented_version);
+
+      /* Add the HSA functions associated with the symbol.  */
+      if (hsa_summaries != NULL)
+	{
+	  hsa_function_summary *s = hsa_summaries->get (cnode);
+	  if (s->kind != HSA_NONE)
+	    {
+	      /* Add binded function.  */
+	      bool added = add_symbol_to_partition_1 (part, s->binded_function);
+	      gcc_assert (added);
+	      if (symtab->dump_file)
+		fprintf (symtab->dump_file,
+			 "adding an HSA function (host/gpu) to the "
+			 "partition: %s\n",
+			 s->binded_function->name ());
+
+	      ipa_ref *ref;
+
+	      /* Add all referring nodes that have an HSA kind.  */
+	      for (unsigned i = 0; node->iterate_referring (i, ref); i++)
+		{
+		  cgraph_node *r = dyn_cast <cgraph_node *> (ref->referring);
+		  if (r && hsa_summaries->get (r)->kind != HSA_NONE)
+		    {
+		      add_symbol_to_partition_1 (part, r);
+		      if (symtab->dump_file)
+			fprintf (symtab->dump_file,
+				 "adding an HSA referring node: %s\n",
+				 r->name ());
+		    }
+		}
+
+	      /* Add all referred nodes that have an HSA kind.  */
+	      for (unsigned i = 0; node->iterate_reference (i, ref); i++)
+		{
+		  cgraph_node *r = dyn_cast <cgraph_node *> (ref->referred);
+		  if (r && hsa_summaries->get (r)->kind != HSA_NONE)
+		    {
+		      add_symbol_to_partition_1 (part, r);
+		      if (symtab->dump_file)
+			fprintf (symtab->dump_file,
+				 "adding an HSA referred symbol: %s\n",
+				 r->name ());
+		    }
+		}
+	    }
+	}
     }
 
   add_references_to_partition (part, node);
diff --git a/gcc/omp-low.c b/gcc/omp-low.c
index d6c521f..2cbd4e8 100644
--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -81,6 +81,7 @@  along with GCC; see the file COPYING3.  If not see
 #include "context.h"
 #include "lto-section-names.h"
 #include "gomp-constants.h"
+#include "symbol-summary.h"
 #include "hsa.h"
 
 
@@ -5236,7 +5237,7 @@  gimple_build_cond_empty (tree cond)
    target region that has not been turned into a simple GPGPU kernel.  */
 
 static bool
-region_part_of_unkernelized_tartget_p (struct omp_region *region)
+region_part_of_unkernelized_target_p (struct omp_region *region)
 {
   if (lookup_attribute ("omp declare target",
 			DECL_ATTRIBUTES (current_function_decl)))
@@ -5429,10 +5430,11 @@  expand_parallel_call (struct omp_region *region, basic_block bb,
 			    false, GSI_CONTINUE_LINKING);
 
   if (hsa_gen_requested_p ()
-      && region_part_of_unkernelized_tartget_p (region))
+      && region_part_of_unkernelized_target_p (region))
     {
       cgraph_node *child_cnode = cgraph_node::get (child_fndecl);
-      child_cnode->hsa_imp_of = child_cnode;
+      hsa_register_kernel (child_cnode);
+
       /* FIXME: Flatten should be set on HSA-only clones created by an IPA
 	 pass.  */
       DECL_ATTRIBUTES (child_fndecl)
@@ -10010,7 +10012,8 @@  expand_target_kernel_body (struct omp_region *target)
     {
       gcc_assert (!tgt_stmt->kernel_iter);
       cgraph_node *n = cgraph_node::get (orig_child_fndecl);
-      n->hsa_imp_of = n;
+
+      hsa_register_kernel (n);
       /* FIXME: Flatten should be set on HSA-only clones created by an IPA
 	 pass.  */
       DECL_ATTRIBUTES (orig_child_fndecl)
@@ -10075,7 +10078,10 @@  expand_target_kernel_body (struct omp_region *target)
 
   cgraph_node *kcn = cgraph_node::get_create (kern_fndecl);
   kcn->mark_force_output ();
-  kcn->hsa_imp_of = cgraph_node::get (orig_child_fndecl);
+  cgraph_node *orig_child = cgraph_node::get (orig_child_fndecl);
+
+  hsa_register_kernel (kcn, orig_child);
+
   /* FIXME: Flatten should be set on HSA-only clones created by an IPA
      pass.  */
   DECL_ATTRIBUTES (kern_fndecl)
diff --git a/gcc/passes.c b/gcc/passes.c
index 1b677ac..86768e0 100644
--- a/gcc/passes.c
+++ b/gcc/passes.c
@@ -2257,7 +2257,7 @@  override_gate_status (opt_pass *pass, tree func, bool gate_status)
 /* Execute PASS. */
 
 bool
-execute_one_pass (opt_pass *pass)
+execute_one_pass (opt_pass *pass, bool *exit)
 {
   unsigned int todo_after = 0;
 
@@ -2362,18 +2362,28 @@  execute_one_pass (opt_pass *pass)
   if (!((todo_after | pass->todo_flags_finish) & TODO_do_not_ggc_collect))
     ggc_collect ();
 
+  /* If TODO_AFTER contains TODO_stop_pass_execution, set *EXIT to true.  */
+  if (todo_after & TODO_stop_pass_execution)
+    *exit = true;
+
   return true;
 }
 
 static void
 execute_pass_list_1 (opt_pass *pass)
 {
+  bool stop_pass_execution = false;
+
   do
     {
       gcc_assert (pass->type == GIMPLE_PASS
 		  || pass->type == RTL_PASS);
-      if (execute_one_pass (pass) && pass->sub)
+      if (execute_one_pass (pass, &stop_pass_execution) && pass->sub)
         execute_pass_list_1 (pass->sub);
+
+      if (stop_pass_execution)
+	return;
+
       pass = pass->next;
     }
   while (pass);
@@ -2714,12 +2724,14 @@  ipa_read_optimization_summaries (void)
 void
 execute_ipa_pass_list (opt_pass *pass)
 {
+  bool stop_pass_execution = false;
+
   do
     {
       gcc_assert (!current_function_decl);
       gcc_assert (!cfun);
       gcc_assert (pass->type == SIMPLE_IPA_PASS || pass->type == IPA_PASS);
-      if (execute_one_pass (pass) && pass->sub)
+      if (execute_one_pass (pass, &stop_pass_execution) && pass->sub)
 	{
 	  if (pass->sub->type == GIMPLE_PASS)
 	    {
diff --git a/gcc/passes.def b/gcc/passes.def
index 60bb6eb..3999fbb 100644
--- a/gcc/passes.def
+++ b/gcc/passes.def
@@ -127,6 +127,7 @@  along with GCC; see the file COPYING3.  If not see
   NEXT_PASS (pass_ipa_inline);
   NEXT_PASS (pass_ipa_pure_const);
   NEXT_PASS (pass_ipa_reference);
+  NEXT_PASS (pass_ipa_hsa);
   /* This pass needs to be scheduled after any IP code duplication.   */
   NEXT_PASS (pass_ipa_single_use);
   /* Comdat privatization come last, as direct references to comdat local
diff --git a/gcc/timevar.def b/gcc/timevar.def
index ac41075..705f6a8 100644
--- a/gcc/timevar.def
+++ b/gcc/timevar.def
@@ -94,6 +94,7 @@  DEFTIMEVAR (TV_WHOPR_WPA_IO          , "whopr wpa I/O")
 DEFTIMEVAR (TV_WHOPR_PARTITIONING    , "whopr partitioning")
 DEFTIMEVAR (TV_WHOPR_LTRANS          , "whopr ltrans")
 DEFTIMEVAR (TV_IPA_REFERENCE         , "ipa reference")
+DEFTIMEVAR (TV_IPA_HSA               , "ipa HSA")
 DEFTIMEVAR (TV_IPA_PROFILE           , "ipa profile")
 DEFTIMEVAR (TV_IPA_AUTOFDO           , "auto profile")
 DEFTIMEVAR (TV_IPA_PURE_CONST        , "ipa pure const")
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
index 91c44a7..0f084f7 100644
--- a/gcc/tree-pass.h
+++ b/gcc/tree-pass.h
@@ -295,6 +295,7 @@  protected:
 
 /* Rebuild the callgraph edges.  */
 #define TODO_rebuild_cgraph_edges       (1 << 22)
+#define TODO_stop_pass_execution	(1 << 23)
 
 /* Internally used in execute_function_todo().  */
 #define TODO_update_ssa_any		\
@@ -480,6 +481,7 @@  extern ipa_opt_pass_d *make_pass_ipa_cp (gcc::context *ctxt);
 extern ipa_opt_pass_d *make_pass_ipa_icf (gcc::context *ctxt);
 extern ipa_opt_pass_d *make_pass_ipa_devirt (gcc::context *ctxt);
 extern ipa_opt_pass_d *make_pass_ipa_reference (gcc::context *ctxt);
+extern ipa_opt_pass_d *make_pass_ipa_hsa (gcc::context *ctxt);
 extern ipa_opt_pass_d *make_pass_ipa_pure_const (gcc::context *ctxt);
 extern simple_ipa_opt_pass *make_pass_ipa_pta (gcc::context *ctxt);
 extern simple_ipa_opt_pass *make_pass_ipa_tm (gcc::context *ctxt);
diff --git a/libgomp/plugin/plugin-hsa.c b/libgomp/plugin/plugin-hsa.c
index d318d52..f9be015 100644
--- a/libgomp/plugin/plugin-hsa.c
+++ b/libgomp/plugin/plugin-hsa.c
@@ -473,8 +473,6 @@  GOMP_OFFLOAD_load_image (int ord, unsigned version  __attribute__ ((unused)),
   if (agent->prog_finalized)
     destroy_hsa_program (agent);
 
-  if (kernel_count == 0)
-    GOMP_PLUGIN_fatal ("No kernels encountered in a brig module description");
   if (debug)
     fprintf (stderr, "Encountered %d kernels in an image\n", kernel_count);
   pair = GOMP_PLUGIN_malloc (kernel_count * sizeof (struct addr_pair));
-- 
2.4.6