diff mbox

[hsa] Add support for kernel from kernel dispatching

Message ID 55BB2DBE.10108@suse.cz
State New
Headers show

Commit Message

Martin Liška July 31, 2015, 8:11 a.m. UTC
Hello.

Following patch implements $subject, however to fully enable the functionality,
omp-low.c must be reverted. That planned operation will be sent soon.

Thanks,
Martin
diff mbox

Patch

From 7833b863218c35479fe40af2dff2e6fde1bd3a48 Mon Sep 17 00:00:00 2001
From: mliska <mliska@suse.cz>
Date: Thu, 30 Jul 2015 16:49:47 +0200
Subject: [PATCH] HSA: add support for kernel from kernel dispatching.

gcc/ChangeLog:

2015-07-31  Martin Liska  <mliska@suse.cz>

	* hsa-brig.c (brig_emit_string): Add new argument.
	(brig_init): Sanitize module names.
	(emit_memory_insn): Make emission more flexible in case of number
	of operands.
	(emit_atomic_insn): Likewise.
	(emit_addr_insn): Likewise.
	(emit_signal_insn): New function.
	(emit_segment_insn): Fix comment coding style.
	(emit_cmp_insn): Likewise.
	(emit_branch_insn): Likewise.
	(emit_call_insn): Likewise.
	(emit_comment_insn): New function.
	(emit_queue_insn): Likewise.
	(emit_basic_insn): Fix comment coding style.
	(emit_insn): Add handling of newly added insn types.
	(perhaps_emit_branch): Fix comment coding style.
	(hsa_output_kernel_mapping): Serialize kernel dependencies.
	* hsa-dump.c (dump_hsa_imm_or_reg): Remove.
	(dump_hsa_operand): New function.
	(dump_hsa_operands): Likewise.
	(dump_hsa_insn): Add dump support of hsa_insn_comment.
	(debug_hsa_insn): Fix segfault.
	* hsa-gen.c (struct hsa_kernel_dispatch): New structure.
	(struct hsa_queue_packet): Likewise.
	(struct hsa_queue): Likewise.
	(set_reg_def): More forward in the source file.
	(hsa_append_insn): Likewise.
	(hsa_function_representation::hsa_function_representation): Add
	shadow register.
	(hsa_function_representation::get_shadow_reg): New function.
	(hsa_init_data_for_cfun): Allocate new memory pools.
	(hsa_deinit_data_for_cfun): Release newly added memory pools.
	(hsa_insn_basic::hsa_insn_basic): Enhance ctor.
	(hsa_insn_mem::hsa_insn_mem): Likewise.
	(hsa_insn_atomic::hsa_insn_atomic): Make ctor more generic.
	(hsa_insn_comment::hsa_insn_comment): New function.
	(hsa_insn_comment::release_string): Likewise.
	(get_unsigned_type_by_bytes): New function.
	(gen_hsa_memory_copy): Likewise.
	(gen_hsa_insns_for_kernel_call): Likewise.
	(gen_hsa_insns_for_call): Use changed ctors.
	(gen_function_def_parameters): Add support for shadow arguments.
	* hsa-regalloc.c (hsa_num_def_ops): Fix indentation.
	* hsa.c (hsa_add_kernel_dependency): New function.
	(hsa_sanitize_name): Sanitize another character.
	(hsa_brig_function_name): New function.

libgomp/ChangeLog:

2015-07-31  Martin Liska  <mliska@suse.cz>

	* hsa-traits.h: New file.
	* plugin/plugin-hsa.c (struct brig_image_desc): Add support for kernel
	dependencies.
	(struct kernel_info): Likewise.
	(struct module_info): Likewise.
	(get_kernel_in_module): New function.
	(GOMP_OFFLOAD_load_image): Assign a kernel to a module.
	(create_kernel_dispatch): New function.
	(release_kernel_dispatch): New function.
	(init_kernel): Fix GNU coding style.
	(indent_stream): New function.
	(print_kernel_dispatch): New function.
	(create_kernel_dispatch_recursive): New function.
	(GOMP_OFFLOAD_run): Handle kernel dependencies in a kernel dispatch
	operation.
	(destroy_module): Fix GNU coding style.
---
 gcc/hsa-brig.c              | 249 +++++++++++--
 gcc/hsa-dump.c              |  93 +++--
 gcc/hsa-gen.c               | 889 ++++++++++++++++++++++++++++++++++++++++++--
 gcc/hsa-regalloc.c          |   8 +-
 gcc/hsa.c                   |  43 ++-
 gcc/hsa.h                   | 102 ++++-
 libgomp/hsa-traits.h        |  52 +++
 libgomp/plugin/plugin-hsa.c | 315 ++++++++++++++--
 8 files changed, 1611 insertions(+), 140 deletions(-)
 create mode 100644 libgomp/hsa-traits.h

diff --git a/gcc/hsa-brig.c b/gcc/hsa-brig.c
index 2aac30a..27c41a5 100644
--- a/gcc/hsa-brig.c
+++ b/gcc/hsa-brig.c
@@ -307,10 +307,11 @@  brig_string_slot_hasher::remove (value_type ds)
 static hash_table<brig_string_slot_hasher> *brig_string_htab;
 
 /* Emit a null terminated string STR to the data section and return its
-   offset in it.  If PREFIX is non-zero, output it just before STR too.  */
+   offset in it.  If PREFIX is non-zero, output it just before STR too.
+   Sanitize the string if SANITIZE option is set to true.  */
 
 static unsigned
-brig_emit_string (const char *str, char prefix = 0)
+brig_emit_string (const char *str, char prefix = 0, bool sanitize = true)
 {
   unsigned slen = strlen (str);
   unsigned offset, len = slen + (prefix ? 1 : 0);
@@ -321,7 +322,9 @@  brig_emit_string (const char *str, char prefix = 0)
 
   /* XXX Sanitize the names without all the strdup.  */
   str2 = xstrdup (str);
-  hsa_sanitize_name (str2);
+
+  if (sanitize)
+    hsa_sanitize_name (str2);
   s_slot.s = str2;
   s_slot.len = slen;
   s_slot.prefix = prefix;
@@ -401,6 +404,7 @@  brig_init (void)
       char* extension = strchr (modname, '.');
       if (extension)
 	*extension = '\0';
+      hsa_sanitize_name (modname);
       moddir.name = brig_emit_string (modname);
       free (modname);
     }
@@ -1042,7 +1046,7 @@  emit_memory_insn (hsa_insn_mem *mem)
 
   operand_offsets[0] = htole32 (enqueue_op (mem->operands[0]));
   operand_offsets[1] = htole32 (enqueue_op (mem->operands[1]));
-  /* We have two operands so use 4 * 2 for the byteCount */
+  /* We have two operands so use 4 * 2 for the byteCount.  */
   byteCount = htole32 (4 * 2);
 
   repr.base.operands = htole32 (brig_data.add (&byteCount, sizeof (byteCount)));
@@ -1065,6 +1069,47 @@  emit_memory_insn (hsa_insn_mem *mem)
   brig_insn_count++;
 }
 
+/* Emit an HSA signal memory instruction and all necessary directives, schedule
+   necessary operands for writing.  */
+
+static void
+emit_signal_insn (hsa_insn_signal *mem)
+{
+  struct BrigInstSignal repr;
+  BrigOperandOffset32_t *operand_offsets = XCNEWVEC (BrigOperandOffset32_t,
+						     mem->operands.length ());
+  uint32_t byteCount;
+
+  /* This is necessary because of the erroneous typedef of
+     BrigMemoryModifier8_t which introduces padding which may then contain
+     random stuff (which we do not want so that we can test things don't
+     change).  */
+  memset (&repr, 0, sizeof (repr));
+  repr.base.base.byteCount = htole16 (sizeof (repr));
+  repr.base.base.kind = htole16 (BRIG_KIND_INST_SIGNAL);
+  repr.base.opcode = htole16 (mem->opcode);
+  repr.base.type = htole16 (mem->type);
+
+  for (unsigned i = 0; i < mem->operands.length (); i++)
+    operand_offsets[i] = htole32 (enqueue_op (mem->operands[i]));
+
+  /* We have N operands so use 4 * N for the byteCount.  */
+  byteCount = htole32 (4 * mem->operands.length ());
+
+  repr.base.operands = htole32 (brig_data.add (&byteCount, sizeof (byteCount)));
+  brig_data.add (operand_offsets, sizeof (BrigOperandOffset32_t) *
+		 mem->operands.length ());
+  brig_data.round_size_up (4);
+  free (operand_offsets);
+
+  repr.memoryOrder = mem->memoryorder;
+  repr.signalOperation = mem->atomicop;
+  repr.signalType = BRIG_TYPE_SIG64;
+
+  brig_code.add (&repr, sizeof (repr));
+  brig_insn_count++;
+}
+
 /* Emit an HSA atomic memory instruction and all necessary directives, schedule
    necessary operands for writing .  */
 
@@ -1072,10 +1117,16 @@  static void
 emit_atomic_insn (hsa_insn_atomic *mem)
 {
   struct BrigInstAtomic repr;
-  BrigOperandOffset32_t operand_offsets[4];
+  BrigOperandOffset32_t *operand_offsets = XCNEWVEC (BrigOperandOffset32_t,
+						     mem->operands.length ());
   uint32_t byteCount;
 
-  hsa_op_address *addr = as_a <hsa_op_address *> (mem->operands[1]);
+  /* Either operand[0] or operand[1] must be an address operand.  */
+  hsa_op_address *addr = NULL;
+  if (is_a <hsa_op_address *> (mem->operands[0]))
+    addr = as_a <hsa_op_address *> (mem->operands[0]);
+  else
+    addr = as_a <hsa_op_address *> (mem->operands[1]);
 
   /* This is necessary because of the erroneous typedef of
      BrigMemoryModifier8_t which introduces padding which may then contain
@@ -1087,17 +1138,17 @@  emit_atomic_insn (hsa_insn_atomic *mem)
   repr.base.opcode = htole16 (mem->opcode);
   repr.base.type = htole16 (mem->type);
 
-  operand_offsets[0] = htole32 (enqueue_op (mem->operands[0]));
-  operand_offsets[1] = htole32 (enqueue_op (mem->operands[1]));
-  operand_offsets[2] = htole32 (enqueue_op (mem->operands[2]));
-  operand_offsets[3] = htole32 (enqueue_op (mem->operands[3]));
+  for (unsigned i = 0; i < mem->operands.length (); i++)
+    operand_offsets[i] = htole32 (enqueue_op (mem->operands[i]));
 
-  /* We have 4 operands so use 4 * 4 for the byteCount */
-  byteCount = htole32 (4 * 4);
+  /* We have N operands so use 4 * N for the byteCount.  */
+  byteCount = htole32 (4 * mem->operands.length ());
 
   repr.base.operands = htole32 (brig_data.add (&byteCount, sizeof (byteCount)));
-  brig_data.add (&operand_offsets, sizeof (operand_offsets));
+  brig_data.add (operand_offsets, sizeof (BrigOperandOffset32_t) *
+		 mem->operands.length ());
   brig_data.round_size_up (4);
+  free (operand_offsets);
 
   if (addr->symbol)
     repr.segment = addr->symbol->segment;
@@ -1118,7 +1169,8 @@  static void
 emit_addr_insn (hsa_insn_basic *insn)
 {
   struct BrigInstAddr repr;
-  BrigOperandOffset32_t operand_offsets[2];
+  BrigOperandOffset32_t *operand_offsets = XCNEWVEC (BrigOperandOffset32_t,
+						     insn->operands.length ());
   uint32_t byteCount;
 
   hsa_op_address *addr = as_a <hsa_op_address *> (insn->operands[1]);
@@ -1128,15 +1180,16 @@  emit_addr_insn (hsa_insn_basic *insn)
   repr.base.opcode = htole16 (insn->opcode);
   repr.base.type = htole16 (insn->type);
 
-  operand_offsets[0] = htole32 (enqueue_op (insn->operands[0]));
-  operand_offsets[1] = htole32 (enqueue_op (insn->operands[1]));
+  for (unsigned i = 0; i < insn->operands.length (); i++)
+    operand_offsets[i] = htole32 (enqueue_op (insn->operands[i]));
 
-  /* We have two operands so use 4 * 2 for the byteCount */
-  byteCount = htole32 (4 * 2);
+  /* We have N operands so use 4 * N for the byteCount.  */
+  byteCount = htole32 (4 * insn->operands.length ());
 
   repr.base.operands = htole32 (brig_data.add (&byteCount, sizeof (byteCount)));
-  brig_data.add (&operand_offsets, sizeof (operand_offsets));
+  brig_data.add (operand_offsets, sizeof (operand_offsets));
   brig_data.round_size_up (4);
+  free (operand_offsets);
 
   if (addr->symbol)
     repr.segment = addr->symbol->segment;
@@ -1166,7 +1219,7 @@  emit_segment_insn (hsa_insn_seg *seg)
   operand_offsets[0] = htole32 (enqueue_op (seg->operands[0]));
   operand_offsets[1] = htole32 (enqueue_op (seg->operands[1]));
 
-  /* We have two operands so use 4 * 2 for the byteCount */
+  /* We have two operands so use 4 * 2 for the byteCount.  */
   byteCount = htole32 (4 * 2);
 
   repr.base.operands = htole32 (brig_data.add (&byteCount, sizeof (byteCount)));
@@ -1201,7 +1254,7 @@  emit_cmp_insn (hsa_insn_cmp *cmp)
   operand_offsets[0] = htole32 (enqueue_op (cmp->operands[0]));
   operand_offsets[1] = htole32 (enqueue_op (cmp->operands[1]));
   operand_offsets[2] = htole32 (enqueue_op (cmp->operands[2]));
-  /* We have three operands so use 4 * 3 for the byteCount */
+  /* We have three operands so use 4 * 3 for the byteCount.  */
   byteCount = htole32 (4 * 3);
 
   repr.base.operands = htole32 (brig_data.add (&byteCount, sizeof (byteCount)));
@@ -1255,7 +1308,7 @@  emit_branch_insn (hsa_insn_br *br)
   operand_offsets[1] = htole32 (enqueue_op
 				(&hsa_bb_for_bb (target)->label_ref));
 
-  /* We have 2 operands so use 4 * 2 for the byteCount */
+  /* We have 2 operands so use 4 * 2 for the byteCount.  */
   byteCount = htole32 (4 * 2);
   repr.base.operands = htole32 (brig_data.add (&byteCount, sizeof (byteCount)));
   brig_data.add (&operand_offsets, sizeof (operand_offsets));
@@ -1345,7 +1398,7 @@  emit_call_insn (hsa_insn_basic *insn)
   /* Operand 2: in-args.  */
   operand_offsets[2] = htole32 (enqueue_op (call->args_code_list));
 
-  /* We have 3 operands so use 3 * 4 for the byteCount */
+  /* We have 3 operands so use 3 * 4 for the byteCount.  */
   byteCount = htole32 (3 * 4);
   repr.base.operands = htole32 (brig_data.add (&byteCount, sizeof (byteCount)));
   brig_data.add (&operand_offsets, sizeof (operand_offsets));
@@ -1402,6 +1455,53 @@  emit_arg_block_insn (hsa_insn_arg_block *insn)
   brig_insn_count++;
 }
 
+/* Emit comment directive.  */
+
+static void
+emit_comment_insn (hsa_insn_comment *insn)
+{
+  struct BrigDirectiveComment repr;
+  repr.base.byteCount = htole16 (sizeof (repr));
+  repr.base.kind = htole16 (insn->opcode);
+  repr.name = brig_emit_string (insn->comment, '\0', false);
+  brig_code.add (&repr, sizeof (repr));
+  insn->release_string ();
+}
+
+/* Emit queue instruction INSN.  */
+
+static void
+emit_queue_insn (hsa_insn_queue *insn)
+{
+  BrigInstQueue repr;
+  auto_vec<BrigOperandOffset32_t, HSA_BRIG_INT_STORAGE_OPERANDS>
+    operand_offsets;
+  uint32_t byteCount, operand_count = insn->operands.length ();
+
+  repr.base.base.byteCount = htole16 (sizeof (BrigInstQueue));
+  repr.base.base.kind = htole16 (BRIG_KIND_INST_QUEUE);
+  repr.base.opcode = htole16 (insn->opcode);
+  repr.base.type = htole16 (insn->type);
+  repr.segment = BRIG_SEGMENT_GLOBAL;
+  repr.memoryOrder = BRIG_MEMORY_ORDER_SC_RELEASE;
+
+  operand_offsets.safe_grow_cleared (operand_count);
+  for (unsigned i = 0; i < operand_count; i++)
+    {
+      gcc_checking_assert (insn->operands[i]);
+      operand_offsets[i] = htole32 (enqueue_op (insn->operands[i]));
+    }
+
+  byteCount = htole32 (4 * operand_count) ;
+  repr.base.operands = htole32 (brig_data.add (&byteCount, sizeof (byteCount)));
+  brig_data.add (operand_offsets.address (),
+		 operand_count * sizeof (BrigOperandOffset32_t));
+  brig_data.round_size_up (4);
+  brig_code.add (&repr, sizeof (repr));
+
+  brig_insn_count++;
+}
+
 /* Emit a basic HSA instruction and all necessary directives, schedule
    necessary operands for writing.  */
 
@@ -1411,7 +1511,8 @@  emit_basic_insn (hsa_insn_basic *insn)
   /* We assume that BrigInstMod has a BrigInstBasic prefix.  */
   struct BrigInstMod repr;
   BrigType16_t type;
-  auto_vec<BrigOperandOffset32_t, HSA_BRIG_INT_STORAGE_OPERANDS> operand_offsets;
+  auto_vec<BrigOperandOffset32_t, HSA_BRIG_INT_STORAGE_OPERANDS>
+    operand_offsets;
   uint32_t byteCount, operand_count = insn->operands.length ();
 
   if (insn->opcode == BRIG_OPCODE_CVT)
@@ -1482,6 +1583,11 @@  static void
 emit_insn (hsa_insn_basic *insn)
 {
   gcc_assert (!is_a <hsa_insn_phi *> (insn));
+  if (hsa_insn_signal *signal = dyn_cast <hsa_insn_signal *> (insn))
+    {
+      emit_signal_insn (signal);
+      return;
+    }
   if (hsa_insn_atomic *atom = dyn_cast <hsa_insn_atomic *> (insn))
     {
       emit_atomic_insn (atom);
@@ -1522,6 +1628,16 @@  emit_insn (hsa_insn_basic *insn)
       emit_call_insn (call);
       return;
     }
+  if (hsa_insn_comment *comment = dyn_cast <hsa_insn_comment *> (insn))
+    {
+      emit_comment_insn (comment);
+      return;
+    }
+  if (hsa_insn_queue *queue = dyn_cast <hsa_insn_queue *> (insn))
+    {
+      emit_queue_insn (queue);
+      return;
+    }
   emit_basic_insn (insn);
 }
 
@@ -1564,7 +1680,7 @@  perhaps_emit_branch (basic_block bb, basic_block next_bb)
   repr.width = BRIG_WIDTH_ALL;
 
   operand_offsets[0] = htole32 (enqueue_op (&hsa_bb_for_bb (ff)->label_ref));
-  /* We have 1 operand so use 4 * 1 for the byteCount */
+  /* We have 1 operand so use 4 * 1 for the byteCount.  */
   byteCount = htole32 (4 * 1);
   repr.base.operands = htole32 (brig_data.add (&byteCount, sizeof (byteCount)));
   brig_data.add (&operand_offsets, sizeof (operand_offsets));
@@ -1675,6 +1791,7 @@  hsa_output_kernel_mapping (tree brig_decl)
     }
   len++;
 
+  /* Kernel mapping is a list of string names terminated by '\0'.  */
   char *buf = XNEWVEC (char, len);
   char *p = buf;
   for (unsigned i = 0; i < map_count; ++i)
@@ -1695,6 +1812,76 @@  hsa_output_kernel_mapping (tree brig_decl)
 					     build_index_type (size_int (len)));
   free (buf);
 
+  /* Kernel dependencies is a list of lists, where for a given kernel A all
+     direct kernel dispatches to A1, A2, .., An are organized are joined by '.'
+     character and each entry in the list is separated by '\0'.  */
+
+  len = 0;
+  for (unsigned i = 0; i < map_count; ++i)
+    {
+      tree caller = hsa_get_decl_kernel_mapping_decl (i);
+
+      if (hsa_decl_kernel_dependencies)
+	{
+	  vec<char *> **slot = hsa_decl_kernel_dependencies->get (caller);
+	  if (slot)
+	    {
+	      vec<char *> *s = *slot;
+	      for (unsigned i = 0; i < s->length (); i++)
+		len += (strlen ((*s)[i]) + 1);
+
+	      /* Add N-1 dot characters.  */
+	      len += 2 * (s->length () - 1);
+	    }
+	}
+
+      /* Zero termination character.  */
+      ++len;
+    }
+
+  buf = XNEWVEC (char, len);
+  p = buf;
+  for (unsigned i = 0; i < map_count; ++i)
+    {
+      tree caller = hsa_get_decl_kernel_mapping_decl (i);
+
+      if (hsa_decl_kernel_dependencies)
+	{
+	  vec<char *> **slot = hsa_decl_kernel_dependencies->get (caller);
+	  if (slot)
+	    {
+	      vec<char *> *s = *slot;
+	      unsigned k, j = 0;
+	      for (k = 0; k < s->length (); k++)
+		{
+		  unsigned ll = strlen ((*s)[k]);
+		  gcc_assert (ll > 0);
+		  memcpy (p, (*s)[k], ll);
+		  p += ll;
+		  *p = '\0';
+		  p++;
+
+		  /* If it is not a last elements, append '.' string.  */
+		  if (j != s->length () - 1)
+		    {
+		      *p++ = '.';
+		      *p++ = '\0';
+		    }
+
+		  j++;
+		}
+	    }
+	}
+
+      /* Zero termination character.  */
+      *p++ = '\0';
+    }
+
+  tree kern_dependencies = build_string (len, buf);
+  TREE_TYPE (kern_dependencies) = build_array_type
+    (char_type_node, build_index_type (size_int (len)));
+  free (buf);
+
   tree hsa_image_desc_type = make_node (RECORD_TYPE);
   tree id_f1 = build_decl (BUILTINS_LOCATION, FIELD_DECL,
 			   get_identifier ("brig_module"), ptr_type_node);
@@ -1702,7 +1889,11 @@  hsa_output_kernel_mapping (tree brig_decl)
   tree id_f2 = build_decl (BUILTINS_LOCATION, FIELD_DECL,
 			   get_identifier ("kern_names"), ptr_type_node);
   DECL_CHAIN (id_f2) = id_f1;
-  finish_builtin_struct (hsa_image_desc_type, "__hsa_image_desc", id_f2,
+  tree id_f3 = build_decl (BUILTINS_LOCATION, FIELD_DECL,
+			   get_identifier ("kern_dependencies"),
+			   ptr_type_node);
+  DECL_CHAIN (id_f3) = id_f2;
+  finish_builtin_struct (hsa_image_desc_type, "__hsa_image_desc", id_f3,
 			 NULL_TREE);
 
   vec<constructor_elt, va_gc> *img_desc_vec = NULL;
@@ -1712,6 +1903,12 @@  hsa_output_kernel_mapping (tree brig_decl)
 			  build1 (ADDR_EXPR,
 				  build_pointer_type (TREE_TYPE (kern_names)),
 				  kern_names));
+  CONSTRUCTOR_APPEND_ELT (img_desc_vec, NULL_TREE,
+			  build1 (ADDR_EXPR,
+				  build_pointer_type (TREE_TYPE
+						      (kern_dependencies)),
+				  kern_dependencies));
+
 
   tree img_desc_ctor = build_constructor (hsa_image_desc_type, img_desc_vec);
 
diff --git a/gcc/hsa-dump.c b/gcc/hsa-dump.c
index e1adcbc..58f4c10 100644
--- a/gcc/hsa-dump.c
+++ b/gcc/hsa-dump.c
@@ -653,18 +653,6 @@  dump_hsa_immed (FILE *f, hsa_op_immed *imm)
   fprintf (f, " (%s)", hsa_type_name (imm->type));
 }
 
-/* Dump textual representation of HSA IL register or immediate value OP to file
-   F.  */
-
-static void
-dump_hsa_imm_or_reg (FILE *f, hsa_op_base *op, bool dump_reg_type = false)
-{
-  if (is_a <hsa_op_reg *> (op))
-    dump_hsa_reg (f, as_a <hsa_op_reg *> (op), dump_reg_type);
-  else
-    dump_hsa_immed (f, as_a <hsa_op_immed *> (op));
-}
-
 /* Dump textual representation of HSA IL address operand ADDR to file F.  */
 
 static void
@@ -695,6 +683,35 @@  dump_hsa_address (FILE *f, hsa_op_address *addr)
     fprintf (f, "[" HOST_WIDE_INT_PRINT_DEC "]", addr->imm_offset);
 }
 
+/* Dump textual representation of HSA IL operand OP to file F.  */
+
+static void
+dump_hsa_operand (FILE *f, hsa_op_base *op, bool dump_reg_type = false)
+{
+  if (is_a <hsa_op_immed *> (op))
+    dump_hsa_immed (f, as_a <hsa_op_immed *> (op));
+  else if (is_a <hsa_op_reg *> (op))
+    dump_hsa_reg (f, as_a <hsa_op_reg *> (op), dump_reg_type);
+  else if (is_a <hsa_op_address *> (op))
+    dump_hsa_address (f, as_a <hsa_op_address *> (op));
+  else
+    fprintf (f, "UNKNOWN_OP_KIND");
+}
+
+/* Dump textual representation of HSA IL operands in VEC to file F.  */
+
+static void
+dump_hsa_operands (FILE *f, vec <hsa_op_base *> &operands,
+		   bool dump_reg_type = false)
+{
+  for (unsigned i = 0; i < operands.length (); i++)
+    {
+      dump_hsa_operand (f, operands[i], dump_reg_type);
+      if (i != operands.length () - 1)
+	fprintf (f, ", ");
+    }
+}
+
 /* Indent F stream with INDENT spaces.  */
 
 static void indent_stream (FILE *f, int indent)
@@ -729,14 +746,34 @@  dump_hsa_insn (FILE *f, hsa_insn_basic *insn, int *indent)
 	    fprintf (f, ", ");
 	  else
 	    first = false;
-	  dump_hsa_imm_or_reg (f, phi->operands[i], true);
+	  dump_hsa_operand (f, phi->operands[i], true);
 	}
       fprintf (f, ">\n");
     }
+  else if (is_a <hsa_insn_signal *> (insn))
+    {
+      hsa_insn_signal *mem = as_a <hsa_insn_signal *> (insn);
+
+      fprintf (f, "%s", hsa_opcode_name (mem->opcode));
+      fprintf (f, "_%s", hsa_atomicop_name (mem->atomicop));
+      if (mem->memoryorder != BRIG_MEMORY_ORDER_NONE)
+	fprintf (f, "_%s", hsa_memsem_name (mem->memoryorder));
+      fprintf (f, "_%s ", hsa_type_name (mem->type));
+
+      dump_hsa_operands (f, mem->operands);
+      fprintf (f, "\n");
+    }
+
   else if (is_a <hsa_insn_atomic *> (insn))
     {
       hsa_insn_atomic *mem = as_a <hsa_insn_atomic *> (insn);
-      hsa_op_address *addr = as_a <hsa_op_address *> (mem->operands[1]);
+
+      /* Either operand[0] or operand[1] must be an address operand.  */
+      hsa_op_address *addr = NULL;
+      if (is_a <hsa_op_address *> (mem->operands[0]))
+	addr = as_a <hsa_op_address *> (mem->operands[0]);
+      else
+	addr = as_a <hsa_op_address *> (mem->operands[1]);
 
       fprintf (f, "%s", hsa_opcode_name (mem->opcode));
       fprintf (f, "_%s", hsa_atomicop_name (mem->atomicop));
@@ -748,16 +785,7 @@  dump_hsa_insn (FILE *f, hsa_insn_basic *insn, int *indent)
 	fprintf (f, "_%s", hsa_memscope_name (mem->memoryscope));
       fprintf (f, "_%s ", hsa_type_name (mem->type));
 
-      dump_hsa_imm_or_reg (f, mem->operands[0]);
-      fprintf (f, ", ");
-      dump_hsa_address (f, addr);
-      fprintf (f, ", ");
-      dump_hsa_imm_or_reg (f, mem->operands[2]);
-      if (mem->atomicop == BRIG_ATOMIC_CAS)
-	{
-	  fprintf (f, ", ");
-	  dump_hsa_imm_or_reg (f, mem->operands[3]);
-	}
+      dump_hsa_operands (f, mem->operands);
       fprintf (f, "\n");
     }
   else if (is_a <hsa_insn_mem *> (insn))
@@ -776,7 +804,7 @@  dump_hsa_insn (FILE *f, hsa_insn_basic *insn, int *indent)
 	fprintf (f, "_equiv(%i)", mem->equiv_class);
       fprintf (f, "_%s ", hsa_type_name (mem->type));
 
-      dump_hsa_imm_or_reg (f, mem->operands[0]);
+      dump_hsa_operand (f, mem->operands[0]);
       fprintf (f, ", ");
       dump_hsa_address (f, addr);
       fprintf (f, "\n");
@@ -790,7 +818,7 @@  dump_hsa_insn (FILE *f, hsa_insn_basic *insn, int *indent)
 	fprintf (f, "_%s", hsa_seg_name (addr->symbol->segment));
       fprintf (f, "_%s ", hsa_type_name (insn->type));
 
-      dump_hsa_imm_or_reg (f, insn->operands[0]);
+      dump_hsa_operand (f, insn->operands[0]);
       fprintf (f, ", ");
       dump_hsa_address (f, addr);
       fprintf (f, "\n");
@@ -803,7 +831,7 @@  dump_hsa_insn (FILE *f, hsa_insn_basic *insn, int *indent)
 	       hsa_type_name (seg->type), hsa_type_name (seg->src_type));
       dump_hsa_reg (f, as_a <hsa_op_reg *> (seg->operands[0]));
       fprintf (f, ", ");
-      dump_hsa_imm_or_reg (f, seg->operands[1]);
+      dump_hsa_operand (f, seg->operands[1]);
       fprintf (f, "\n");
     }
   else if (is_a <hsa_insn_cmp *> (insn))
@@ -821,9 +849,9 @@  dump_hsa_insn (FILE *f, hsa_insn_basic *insn, int *indent)
 	       hsa_type_name (cmp->type), hsa_type_name (src_type));
       dump_hsa_reg (f, as_a <hsa_op_reg *> (cmp->operands[0]));
       fprintf (f, ", ");
-      dump_hsa_imm_or_reg (f, cmp->operands[1]);
+      dump_hsa_operand (f, cmp->operands[1]);
       fprintf (f, ", ");
-      dump_hsa_imm_or_reg (f, cmp->operands[2]);
+      dump_hsa_operand (f, cmp->operands[2]);
       fprintf (f, "\n");
     }
   else if (is_a <hsa_insn_br *> (insn))
@@ -886,6 +914,10 @@  dump_hsa_insn (FILE *f, hsa_insn_basic *insn, int *indent)
 	}
       fprintf (f, ")\n");
     }
+  else if (is_a <hsa_insn_comment *> (insn))
+    {
+      fprintf (f, "%s\n", as_a <hsa_insn_comment *> (insn)->comment);
+    }
   else
     {
       bool first = true;
@@ -991,7 +1023,8 @@  dump_hsa_cfun (FILE *f)
 DEBUG_FUNCTION void
 debug_hsa_insn (hsa_insn_basic *insn)
 {
-  dump_hsa_insn (stderr, insn, 0);
+  int indentation = 0;
+  dump_hsa_insn (stderr, insn, &indentation);
 }
 
 /* Dump textual representation of HSA IL in HBB to stderr.  */
diff --git a/gcc/hsa-gen.c b/gcc/hsa-gen.c
index 1274865..bfa1ace 100644
--- a/gcc/hsa-gen.c
+++ b/gcc/hsa-gen.c
@@ -72,6 +72,70 @@  along with GCC; see the file COPYING3.  If not see
 #include "print-tree.h"
 #include "hsa.h"
 
+/* Following structures are defined in the final version
+   of HSA specification.  */
+
+/* HSA kernel dispatch is collection of informations needed for
+   a kernel dispatch.  */
+
+struct hsa_kernel_dispatch
+{
+  /* Pointer to a command queue associated with a kernel dispatch agent.  */
+  void *queue;
+  /* Pointer to reserved memory for OMP data struct copying.  */
+  void *omp_data_memory;
+  /* Pointer to a memory space used for kernel arguments passing.  */
+  void *kernarg_address;
+  /* Kernel object.  */
+  uint64_t object;
+  /* Synchronization signal used for dispatch synchronization.  */
+  uint64_t signal;
+  /* Private segment size.  */
+  uint32_t private_segment_size;
+  /* Group segment size.  */
+  uint32_t group_segment_size;
+  /* Number of children kernel dispatches.  */
+  uint64_t kernel_dispatch_count;
+  /* Debug purpose argument.  */
+  uint64_t debug;
+  /* Kernel dispatch structures created for children kernel dispatches.  */
+  struct hsa_kernel_dispatch **children_dispatches;
+};
+
+/* HSA queue packet is shadow structure, originally provided by AMD.  */
+
+struct hsa_queue_packet
+{
+  uint16_t header;
+  uint16_t setup;
+  uint16_t workgroup_size_x;
+  uint16_t workgroup_size_y;
+  uint16_t workgroup_size_z;
+  uint16_t reserved0;
+  uint32_t grid_size_x;
+  uint32_t grid_size_y;
+  uint32_t grid_size_z;
+  uint32_t private_segment_size;
+  uint32_t group_segment_size;
+  uint64_t kernel_object;
+  void *kernarg_address;
+  uint64_t reserved2;
+  uint64_t completion_signal;
+};
+
+/* HSA queue is shadow structure, originally provided by AMD.  */
+
+struct hsa_queue
+{
+  int type;
+  uint32_t features;
+  void *base_address;
+  uint64_t doorbell_signal;
+  uint32_t size;
+  uint32_t reserved1;
+  uint64_t id;
+};
+
 /* Alloc pools for allocating basic hsa structures such as operands,
    instructions and other basic entities.s */
 static object_allocator<hsa_op_address> *hsa_allocp_operand_address;
@@ -82,11 +146,14 @@  static object_allocator<hsa_insn_basic> *hsa_allocp_inst_basic;
 static object_allocator<hsa_insn_phi> *hsa_allocp_inst_phi;
 static object_allocator<hsa_insn_mem> *hsa_allocp_inst_mem;
 static object_allocator<hsa_insn_atomic> *hsa_allocp_inst_atomic;
+static object_allocator<hsa_insn_signal> *hsa_allocp_inst_signal;
 static object_allocator<hsa_insn_seg> *hsa_allocp_inst_seg;
 static object_allocator<hsa_insn_cmp> *hsa_allocp_inst_cmp;
 static object_allocator<hsa_insn_br> *hsa_allocp_inst_br;
 static object_allocator<hsa_insn_call> *hsa_allocp_inst_call;
 static object_allocator<hsa_insn_arg_block> *hsa_allocp_inst_arg_block;
+static object_allocator<hsa_insn_comment> *hsa_allocp_inst_comment;
+static object_allocator<hsa_insn_queue> *hsa_allocp_inst_queue;
 static object_allocator<hsa_bb> *hsa_allocp_bb;
 static object_allocator<hsa_symbol> *hsa_allocp_symbols;
 
@@ -95,6 +162,39 @@  static object_allocator<hsa_symbol> *hsa_allocp_symbols;
 static vec <hsa_op_code_list *> hsa_list_operand_code_list;
 static vec <hsa_op_reg *> hsa_list_operand_reg;
 
+/* Set the defining instruction of REG to be INSN.  When checking, make sure it
+   was not set before.  */
+
+static inline void
+set_reg_def (hsa_op_reg *reg, hsa_insn_basic *insn)
+{
+  if (hsa_cfun->in_ssa)
+    {
+      gcc_checking_assert (!reg->def_insn);
+      reg->def_insn = insn;
+    }
+  else
+    reg->def_insn = NULL;
+}
+
+/* Append HSA instruction INSN to basic block HBB.  */
+
+static void
+hsa_append_insn (hsa_bb *hbb, hsa_insn_basic *insn)
+{
+  /* Make sure we did not forget to set the kind.  */
+  gcc_assert (!insn->bb);
+
+  insn->bb = hbb->bb;
+  insn->prev = hbb->last_insn;
+  insn->next = NULL;
+  if (hbb->last_insn)
+    hbb->last_insn->next = insn;
+  hbb->last_insn = insn;
+  if (!hbb->first_insn)
+    hbb->first_insn = insn;
+}
+
 /* Constructor of class representing global HSA function/kernel information and
    state.  */
 
@@ -117,6 +217,8 @@  hsa_function_representation::hsa_function_representation ()
   kern_p = false;
   declaration_p = false;
   called_functions = vNULL;
+  shadow_reg = NULL;
+  kernel_dispatch_count = 0;
 }
 
 /* Destructor of class holding function/kernel-wide informaton and state.  */
@@ -134,6 +236,36 @@  hsa_function_representation::~hsa_function_representation ()
   called_functions.release ();
 }
 
+hsa_op_reg *
+hsa_function_representation::get_shadow_reg ()
+{
+  gcc_assert (kern_p);
+
+  if (shadow_reg)
+    return shadow_reg;
+
+  /* Append the shadow argument.  */
+  hsa_symbol *shadow = &input_args[input_args_count++];
+  shadow->type = BRIG_TYPE_U64;
+  shadow->segment = BRIG_SEGMENT_KERNARG;
+  shadow->linkage = BRIG_LINKAGE_FUNCTION;
+  shadow->name = "hsa_runtime_shadow";
+
+  hsa_insn_mem *mem = new (hsa_allocp_inst_mem)
+    hsa_insn_mem (BRIG_OPCODE_LD, BRIG_TYPE_U64);
+
+  hsa_op_reg *r = new (hsa_allocp_operand_reg) hsa_op_reg (BRIG_TYPE_U64);
+
+  mem->operands[0] = r;
+  mem->operands[1] = new (hsa_allocp_operand_address)
+    hsa_op_address (shadow, NULL, 0);
+  set_reg_def (r, mem);
+  hsa_append_insn (&prologue, mem);
+  shadow_reg = r;
+
+  return r;
+}
+
 /* Allocate HSA structures that we need only while generating with this.  */
 
 static void
@@ -158,6 +290,8 @@  hsa_init_data_for_cfun ()
     = new object_allocator<hsa_insn_mem> ("HSA memory instructions", 32);
   hsa_allocp_inst_atomic
     = new object_allocator<hsa_insn_atomic> ("HSA atomic instructions", 32);
+  hsa_allocp_inst_signal
+    = new object_allocator<hsa_insn_signal> ("HSA signal instructions", 32);
   hsa_allocp_inst_seg
     = new object_allocator<hsa_insn_seg> ("HSA segment conversion instructions",
 					  16);
@@ -170,6 +304,12 @@  hsa_init_data_for_cfun ()
   hsa_allocp_inst_arg_block
     = new object_allocator<hsa_insn_arg_block> ("HSA arg block instructions",
 						16);
+  hsa_allocp_inst_comment
+    = new object_allocator<hsa_insn_comment> ("HSA comment instructions",
+						16);
+  hsa_allocp_inst_queue
+    = new object_allocator<hsa_insn_queue> ("HSA queue instructions",
+						16);
   hsa_allocp_bb = new object_allocator<hsa_bb> ("HSA basic blocks", 8);
 
   sym_init_len = (vec_safe_length (cfun->local_decls) / 2) + 1;
@@ -219,11 +359,14 @@  hsa_deinit_data_for_cfun (void)
   delete hsa_allocp_inst_phi;
   delete hsa_allocp_inst_atomic;
   delete hsa_allocp_inst_mem;
+  delete hsa_allocp_inst_signal;
   delete hsa_allocp_inst_seg;
   delete hsa_allocp_inst_cmp;
   delete hsa_allocp_inst_br;
   delete hsa_allocp_inst_call;
   delete hsa_allocp_inst_arg_block;
+  delete hsa_allocp_inst_comment;
+  delete hsa_allocp_inst_queue;
   delete hsa_allocp_bb;
   delete hsa_allocp_symbols;
   delete hsa_cfun;
@@ -569,12 +712,13 @@  hsa_op_with_type::hsa_op_with_type (BrigKind16_t k, BrigType16_t t)
 }
 
 /* Constructor of class representing HSA immediate values.  TREE_VAL is the
-   tree representation of the immediate value. */
+   tree representation of the immediate value.  If min32int is true,
+   always expand integer types to one that has at least 32 bits.  */
 
-hsa_op_immed::hsa_op_immed (tree tree_val)
+hsa_op_immed::hsa_op_immed (tree tree_val, bool min32int)
   : hsa_op_with_type (BRIG_KIND_OPERAND_CONSTANT_BYTES,
 		      hsa_type_for_scalar_tree_type (TREE_TYPE (tree_val),
-						     true))
+						     min32int))
 {
   gcc_checking_assert (is_gimple_min_invariant (tree_val)
 		       && !POINTER_TYPE_P (TREE_TYPE (tree_val)));
@@ -665,21 +809,6 @@  hsa_reg_for_gimple_ssa (tree ssa, vec <hsa_op_reg_p> *ssa_map)
   return hreg;
 }
 
-/* Set the defining instruction of REG to be INSN.  When checking, make sure it
-   was not set before.  */
-
-static inline void
-set_reg_def (hsa_op_reg *reg, hsa_insn_basic *insn)
-{
-  if (hsa_cfun->in_ssa)
-    {
-      gcc_checking_assert (!reg->def_insn);
-      reg->def_insn = insn;
-    }
-  else
-    reg->def_insn = NULL;
-}
-
 /* Constructor of the class which is the bases of all instructions and directly
    represents the most basic ones.  NOPS is the number of operands that the
    operand vector will contain (and which will be cleared).  OP is the opcode
@@ -703,7 +832,9 @@  hsa_insn_basic::hsa_insn_basic (unsigned nops, int opc)
    operand vector will contain (and which will be cleared).  OPC is the opcode
    of the instruction, T is the type of the instruction.  */
 
-hsa_insn_basic::hsa_insn_basic (unsigned nops, int opc, BrigType16_t t)
+hsa_insn_basic::hsa_insn_basic (unsigned nops, int opc, BrigType16_t t,
+				hsa_op_base *arg0, hsa_op_base *arg1,
+				hsa_op_base *arg2)
 {
   opcode = opc;
   type = t;
@@ -714,6 +845,24 @@  hsa_insn_basic::hsa_insn_basic (unsigned nops, int opc, BrigType16_t t)
 
   if (nops > 0)
     operands.safe_grow_cleared (nops);
+
+  if (arg0 != NULL)
+    {
+      gcc_checking_assert (nops >= 1);
+      operands[0] = arg0;
+    }
+
+  if (arg1 != NULL)
+    {
+      gcc_checking_assert (nops >= 2);
+      operands[1] = arg1;
+    }
+
+  if (arg2 != NULL)
+    {
+      gcc_checking_assert (nops >= 3);
+      operands[2] = arg2;
+    }
 }
 
 /* Constructor of an instruction representing a PHI node.  NOPS is the number
@@ -748,13 +897,16 @@  hsa_insn_cmp::hsa_insn_cmp (BrigCompareOperation8_t cmp, BrigType16_t t)
 /* Constructor of classes representing memory accesses.  OPC is the opcode (must
    be BRIG_OPCODE_ST or BRIG_OPCODE_LD) and T is the type.  */
 
-hsa_insn_mem::hsa_insn_mem (int opc, BrigType16_t t)
+hsa_insn_mem::hsa_insn_mem (int opc, BrigType16_t t,
+			    hsa_op_base *arg0, hsa_op_base *arg1)
   : hsa_insn_basic (2, opc, t)
 {
   gcc_checking_assert (opc == BRIG_OPCODE_LD || opc == BRIG_OPCODE_ST);
   equiv_class = 0;
   memoryorder = BRIG_MEMORY_ORDER_NONE;
   memoryscope = BRIG_MEMORY_SCOPE_NONE;
+  operands[0] = arg0;
+  operands[1] = arg1;
 }
 
 /* Constructor for descendants allowing different opcodes and number of
@@ -773,15 +925,30 @@  hsa_insn_mem::hsa_insn_mem (unsigned nops, int opc, BrigType16_t t)
    opcode, aop is the specific atomic operation opcode.  T is the type of the
    instruction.  */
 
-hsa_insn_atomic::hsa_insn_atomic (int opc, enum BrigAtomicOperation aop,
+hsa_insn_atomic::hsa_insn_atomic (int nops, int opc,
+				  enum BrigAtomicOperation aop,
 				  BrigType16_t t)
-  : hsa_insn_mem (4, opc, t)
+  : hsa_insn_mem (nops, opc, t)
 {
   gcc_checking_assert (opc == BRIG_OPCODE_ATOMICNORET ||
-		       opc == BRIG_OPCODE_ATOMIC);
+		       opc == BRIG_OPCODE_ATOMIC ||
+		       opc == BRIG_OPCODE_SIGNAL ||
+		       opc == BRIG_OPCODE_SIGNALNORET);
   atomicop = aop;
 }
 
+/* Constructor of class representing signal instructions.  OPC is the prinicpa;
+   opcode, sop is the specific signal operation opcode.  T is the type of the
+   instruction.  */
+
+hsa_insn_signal::hsa_insn_signal (int nops, int opc,
+				  enum BrigAtomicOperation sop,
+				  BrigType16_t t)
+  : hsa_insn_atomic (nops, opc, sop, t)
+{
+}
+
+
 /* Constructor of class representing segment conversion instructions.  OPC is
    the opcode which must be either BRIG_OPCODE_STOF or BRIG_OPCODE_FTOS.  DESTT
    and SRCT are destination and source types respectively, SEG is the segment
@@ -816,25 +983,32 @@  hsa_insn_arg_block::hsa_insn_arg_block (BrigKind brig_kind,
 {
 }
 
-/* Append HSA instruction INSN to basic block HBB.  */
+hsa_insn_comment::hsa_insn_comment (const char *s)
+  : hsa_insn_basic (0, BRIG_KIND_DIRECTIVE_COMMENT)
+{
+  unsigned l = strlen (s);
 
-static void
-hsa_append_insn (hsa_bb *hbb, hsa_insn_basic *insn)
+  /* Append '// ' to the string.  */
+  char *buf = XNEWVEC (char, l + 4);
+  sprintf (buf, "// %s", s);
+  comment = buf;
+}
+
+void
+hsa_insn_comment::release_string ()
 {
-  /* Make sure we did not forget to set the kind.  */
-  gcc_assert (insn->opcode != 0);
-  gcc_assert (!insn->bb);
+  gcc_checking_assert (comment);
+  free (comment);
+  comment = NULL;
+}
 
-  insn->bb = hbb->bb;
-  insn->prev = hbb->last_insn;
-  insn->next = NULL;
-  if (hbb->last_insn)
-    hbb->last_insn->next = insn;
-  hbb->last_insn = insn;
-  if (!hbb->first_insn)
-    hbb->first_insn = insn;
+/* Constructor of class representing the queue instruction in HSAIL.  */
+hsa_insn_queue::hsa_insn_queue (int nops, BrigOpcode opcode)
+  : hsa_insn_basic (nops, opcode, BRIG_TYPE_U64)
+{
 }
 
+
 /* Insert HSA instruction NEW_INSN immediately before an existing instruction
    OLD_INSN.  */
 
@@ -1896,6 +2070,73 @@  gen_hsa_insns_for_return (greturn *stmt, hsa_bb *hbb,
   hsa_append_insn (hbb, ret);
 }
 
+/* Return unsigned brig type according to provided SIZE in bytes.  */
+
+static BrigType16_t
+get_unsinged_type_by_bytes (unsigned size)
+{
+  switch (size)
+    {
+    case 1:
+      return BRIG_TYPE_U8;
+    case 2:
+      return BRIG_TYPE_U16;
+    case 4:
+      return BRIG_TYPE_U32;
+    case 8:
+      return BRIG_TYPE_U64;
+    default:
+      gcc_unreachable ();
+    }
+}
+
+/* Generate memory copy instructions that are going to be used
+   for copying a HSA symbol SRC to TARGET memory, represented by
+   pointer in a register.  */
+
+static void
+gen_hsa_memory_copy (hsa_bb *hbb, hsa_symbol *src, hsa_op_reg *target)
+{
+  hsa_op_address *addr;
+  hsa_insn_mem *mem;
+
+  gcc_assert (src->type | BRIG_TYPE_U8);
+
+  unsigned size = src->dim;
+  unsigned offset = 0;
+
+  while (size)
+    {
+      unsigned s;
+      if (size >= 8)
+	s = 8;
+      else if (size >= 4)
+	s = 4;
+      else if (size >= 2)
+	s = 2;
+      else
+	s = 1;
+
+      BrigType16_t t = get_unsinged_type_by_bytes (s);
+
+      hsa_op_reg *tmp = new (hsa_allocp_operand_reg) hsa_op_reg (t);
+      addr = new (hsa_allocp_operand_address) hsa_op_address (src, NULL,
+							      offset);
+      mem = new (hsa_allocp_inst_mem) hsa_insn_mem (BRIG_OPCODE_LD, t,
+						    tmp, addr);
+      hsa_append_insn (hbb, mem);
+      set_reg_def (tmp, mem);
+
+      addr = new (hsa_allocp_operand_address) hsa_op_address
+	(NULL, target, offset);
+      mem = new (hsa_allocp_inst_mem) hsa_insn_mem (BRIG_OPCODE_ST, t, tmp,
+						    addr);
+      hsa_append_insn (hbb, mem);
+      offset += s;
+      size -= s;
+    }
+}
+
 /* If STMT is a call of a known library function, generate code to perform
    it and return true.  */
 
@@ -1923,9 +2164,556 @@  gen_hsa_insns_for_known_library_call (gimple stmt, hsa_bb *hbb,
   return false;
 }
 
+/* Generate HSA instructions for the given kernel call statement CALL.
+   Instructions will be appended to HBB.  */
+
+static void
+gen_hsa_insns_for_kernel_call (hsa_bb *hbb, gcall *call)
+{
+  /* TODO: all emitted instructions assume that
+     we run on a LARGE_MODEL agent.  */
+
+  hsa_insn_mem *mem;
+  hsa_op_address *addr;
+  hsa_op_immed *c;
+
+  hsa_op_reg *shadow_reg_ptr = hsa_cfun->get_shadow_reg ();
+
+  /* Get my kernel dispatch argument.  */
+  hsa_append_insn (hbb, new (hsa_allocp_inst_comment)
+		   hsa_insn_comment ("get kernel dispatch structure"));
+  addr = new (hsa_allocp_operand_address)
+	hsa_op_address (NULL, shadow_reg_ptr,
+			offsetof (hsa_kernel_dispatch, children_dispatches));
+
+  hsa_op_reg *shadow_reg_base_ptr = new (hsa_allocp_operand_reg)
+    hsa_op_reg (BRIG_TYPE_U64);
+  mem = new (hsa_allocp_inst_mem) hsa_insn_mem (BRIG_OPCODE_LD, BRIG_TYPE_U64,
+						shadow_reg_base_ptr, addr);
+  set_reg_def (shadow_reg_base_ptr, mem);
+  hsa_append_insn (hbb, mem);
+
+  unsigned index = hsa_cfun->kernel_dispatch_count;
+  unsigned byte_offset = index * sizeof (hsa_kernel_dispatch *);
+
+  addr = new (hsa_allocp_operand_address)
+	hsa_op_address (NULL, shadow_reg_base_ptr, byte_offset);
+
+  hsa_op_reg *shadow_reg = new (hsa_allocp_operand_reg)
+    hsa_op_reg (BRIG_TYPE_U64);
+  mem = new (hsa_allocp_inst_mem) hsa_insn_mem (BRIG_OPCODE_LD, BRIG_TYPE_U64,
+						shadow_reg, addr);
+  set_reg_def (shadow_reg, mem);
+  hsa_append_insn (hbb, mem);
+
+  /* Emit store to debug argument.  */
+  addr = new (hsa_allocp_operand_address)
+	hsa_op_address (NULL, shadow_reg,
+			offsetof (hsa_kernel_dispatch, debug));
+
+  /* Create a magic number that is going to be printed by libgomp.  */
+  c = new (hsa_allocp_operand_immed)
+    hsa_op_immed (build_int_cstu (uint64_type_node, 1000 + index));
+  mem = new (hsa_allocp_inst_mem) hsa_insn_mem (BRIG_OPCODE_ST, BRIG_TYPE_U64,
+						c, addr);
+  hsa_append_insn (hbb, mem);
+
+  /* Load an address of the command queue to a register.  */
+  hsa_append_insn (hbb, new (hsa_allocp_inst_comment)
+		   hsa_insn_comment ("load base address of command queue"));
+
+  hsa_op_reg *queue_reg = new (hsa_allocp_operand_reg)
+    hsa_op_reg (BRIG_TYPE_U64);
+
+  mem = new (hsa_allocp_inst_mem)
+    hsa_insn_mem (BRIG_OPCODE_LD, BRIG_TYPE_U64);
+
+  mem->operands[0] = queue_reg;
+  mem->operands[1] = new (hsa_allocp_operand_address)
+	hsa_op_address (NULL, shadow_reg,
+			offsetof (hsa_kernel_dispatch, queue));
+  set_reg_def (queue_reg, mem);
+  hsa_append_insn (hbb, mem);
+
+  /* Load an address of prepared memory for a kernel arguments.  */
+  hsa_append_insn (hbb, new (hsa_allocp_inst_comment)
+		   hsa_insn_comment ("get a kernarg address"));
+  hsa_op_reg *kernarg_reg = new (hsa_allocp_operand_reg)
+    hsa_op_reg (BRIG_TYPE_U64);
+
+  addr = new (hsa_allocp_operand_address)
+	hsa_op_address (NULL, shadow_reg,
+			offsetof (hsa_kernel_dispatch, kernarg_address));
+
+  mem = new (hsa_allocp_inst_mem) hsa_insn_mem (BRIG_OPCODE_LD, BRIG_TYPE_U64,
+						kernarg_reg, addr);
+  set_reg_def (kernarg_reg, mem);
+  hsa_append_insn (hbb, mem);
+
+  /* Load an kernel object we want to call.  */
+  hsa_append_insn (hbb, new (hsa_allocp_inst_comment)
+		   hsa_insn_comment ("get a kernel object"));
+  hsa_op_reg *object_reg = new (hsa_allocp_operand_reg)
+    hsa_op_reg (BRIG_TYPE_U64);
+
+  addr = new (hsa_allocp_operand_address)
+	hsa_op_address (NULL, shadow_reg,
+			offsetof (hsa_kernel_dispatch, object));
+
+  mem = new (hsa_allocp_inst_mem) hsa_insn_mem (BRIG_OPCODE_LD, BRIG_TYPE_U64,
+						object_reg, addr);
+  set_reg_def (object_reg, mem);
+  hsa_append_insn (hbb, mem);
+
+  /* Get signal prepared for the kernel dispatch.  */
+  hsa_append_insn (hbb, new (hsa_allocp_inst_comment)
+		   hsa_insn_comment ("get a signal by "
+				     "kernel call index"));
+
+  hsa_op_reg *signal_reg = new (hsa_allocp_operand_reg)
+    hsa_op_reg (BRIG_TYPE_U64);
+  addr = new (hsa_allocp_operand_address)
+	hsa_op_address (NULL, shadow_reg,
+			offsetof (hsa_kernel_dispatch, signal));
+  mem = new (hsa_allocp_inst_mem) hsa_insn_mem (BRIG_OPCODE_LD, BRIG_TYPE_U64,
+						signal_reg, addr);
+  set_reg_def (signal_reg, mem);
+  hsa_append_insn (hbb, mem);
+
+  /* Store to synchronization signal.  */
+  hsa_append_insn (hbb, new (hsa_allocp_inst_comment)
+		   hsa_insn_comment ("store 1 to signal handle"));
+
+  c = new (hsa_allocp_operand_immed)
+    hsa_op_immed (build_int_cstu (uint64_type_node, 1));
+
+  hsa_insn_signal *signal= new (hsa_allocp_inst_signal)
+    hsa_insn_signal (2, BRIG_OPCODE_SIGNALNORET, BRIG_ATOMIC_ST,
+		     BRIG_TYPE_B64);
+  signal->memoryorder = BRIG_MEMORY_ORDER_RELAXED;
+  signal->memoryscope = BRIG_MEMORY_SCOPE_SYSTEM;
+  signal->operands[0] = signal_reg;
+  signal->operands[1] = c;
+  hsa_append_insn (hbb, signal);
+
+  /* Get private segment size.  */
+  hsa_op_reg *private_seg_reg = new (hsa_allocp_operand_reg)
+    hsa_op_reg (BRIG_TYPE_U32);
+
+  hsa_append_insn (hbb, new (hsa_allocp_inst_comment)
+		   hsa_insn_comment ("get a kernel private segment size by "
+				     "kernel call index"));
+
+  addr = new (hsa_allocp_operand_address)
+	hsa_op_address (NULL, shadow_reg,
+			offsetof (hsa_kernel_dispatch, private_segment_size));
+  mem = new (hsa_allocp_inst_mem) hsa_insn_mem (BRIG_OPCODE_LD, BRIG_TYPE_U32,
+						private_seg_reg, addr);
+  set_reg_def (private_seg_reg, mem);
+  hsa_append_insn (hbb, mem);
+
+  /* Get group segment size.  */
+  hsa_op_reg *group_seg_reg = new (hsa_allocp_operand_reg)
+    hsa_op_reg (BRIG_TYPE_U32);
+
+  hsa_append_insn (hbb, new (hsa_allocp_inst_comment)
+		   hsa_insn_comment ("get a kernel group segment size by "
+				     "kernel call index"));
+
+  addr = new (hsa_allocp_operand_address)
+	hsa_op_address (NULL, shadow_reg,
+			offsetof (hsa_kernel_dispatch, group_segment_size));
+  mem = new (hsa_allocp_inst_mem) hsa_insn_mem (BRIG_OPCODE_LD, BRIG_TYPE_U32,
+						group_seg_reg, addr);
+  set_reg_def (group_seg_reg, mem);
+  hsa_append_insn (hbb, mem);
+
+  /* Get a write index to the command queue.  */
+  hsa_op_reg *queue_index_reg = new (hsa_allocp_operand_reg)
+    hsa_op_reg (BRIG_TYPE_U64);
+
+  c = new (hsa_allocp_operand_immed)
+    hsa_op_immed (build_int_cstu (uint64_type_node, 1));
+  hsa_insn_queue *queue = new (hsa_allocp_inst_queue)
+    hsa_insn_queue (3, BRIG_OPCODE_ADDQUEUEWRITEINDEX);
+
+  addr = new (hsa_allocp_operand_address)
+	hsa_op_address (NULL, queue_reg, 0);
+  queue->operands[0] = queue_index_reg;
+  queue->operands[1] = addr;
+  queue->operands[2] = c;
+
+  set_reg_def (queue_index_reg, queue);
+  hsa_append_insn (hbb, queue);
+
+  /* Get packet base address.  */
+  size_t addr_offset = offsetof (hsa_queue, base_address);
+
+  hsa_op_reg *queue_addr_reg = new (hsa_allocp_operand_reg)
+    hsa_op_reg (BRIG_TYPE_U64);
+
+  c = new (hsa_allocp_operand_immed)
+    hsa_op_immed (build_int_cstu (uint64_type_node, addr_offset));
+  hsa_insn_basic *insn = new (hsa_allocp_inst_basic)
+    hsa_insn_basic (3, BRIG_OPCODE_ADD, BRIG_TYPE_U64, queue_addr_reg,
+		    queue_reg, c);
+
+  set_reg_def (queue_addr_reg, insn);
+  hsa_append_insn (hbb, insn);
+
+  hsa_append_insn (hbb, new (hsa_allocp_inst_comment)
+		   hsa_insn_comment ("get base address of prepared packet"));
+
+  hsa_op_reg *queue_addr_value_reg = new (hsa_allocp_operand_reg)
+    hsa_op_reg (BRIG_TYPE_U64);
+  addr = new (hsa_allocp_operand_address)
+	hsa_op_address (NULL, queue_addr_reg, 0);
+  mem = new (hsa_allocp_inst_mem) hsa_insn_mem (BRIG_OPCODE_LD, BRIG_TYPE_U64,
+						queue_addr_value_reg, addr);
+  set_reg_def (queue_addr_value_reg, mem);
+  hsa_append_insn (hbb, mem);
+
+  c = new (hsa_allocp_operand_immed)
+    hsa_op_immed (build_int_cstu (uint64_type_node,
+				  sizeof (hsa_queue_packet)));
+  hsa_op_reg *queue_packet_offset_reg = new (hsa_allocp_operand_reg)
+    hsa_op_reg (BRIG_TYPE_U64);
+  insn = new (hsa_allocp_inst_basic)
+    hsa_insn_basic (3, BRIG_OPCODE_MUL, BRIG_TYPE_U64, queue_packet_offset_reg,
+		    queue_index_reg, c);
+
+  set_reg_def (queue_packet_offset_reg, insn);
+  hsa_append_insn (hbb, insn);
+
+  hsa_op_reg *queue_packet_reg = new (hsa_allocp_operand_reg)
+    hsa_op_reg (BRIG_TYPE_U64);
+  insn = new (hsa_allocp_inst_basic)
+    hsa_insn_basic (3, BRIG_OPCODE_ADD, BRIG_TYPE_U64, queue_packet_reg,
+		    queue_addr_value_reg, queue_packet_offset_reg);
+
+  set_reg_def (queue_packet_reg, insn);
+  hsa_append_insn (hbb, insn);
+
+
+  /* Write to packet->setup.  */
+  hsa_append_insn (hbb, new (hsa_allocp_inst_comment)
+		   hsa_insn_comment ("set packet->setup |= 1"));
+
+  addr = new (hsa_allocp_operand_address)
+	hsa_op_address (NULL, queue_packet_reg, offsetof
+			(hsa_queue_packet, setup));
+  hsa_op_reg *packet_setup_reg = new (hsa_allocp_operand_reg)
+    hsa_op_reg (BRIG_TYPE_U16);
+  mem = new (hsa_allocp_inst_mem) hsa_insn_mem
+    (BRIG_OPCODE_LD, BRIG_TYPE_U16, packet_setup_reg, addr);
+  hsa_append_insn (hbb, mem);
+  set_reg_def (packet_setup_reg, mem);
+
+  hsa_op_reg *packet_setup_u32 = new (hsa_allocp_operand_reg)
+    hsa_op_reg (BRIG_TYPE_U32);
+
+  hsa_insn_basic *cvtinsn = new (hsa_allocp_inst_basic)
+    hsa_insn_basic (2, BRIG_OPCODE_CVT, BRIG_TYPE_U32, packet_setup_u32,
+		    packet_setup_reg);
+  hsa_append_insn (hbb, cvtinsn);
+  set_reg_def (packet_setup_u32, cvtinsn);
+
+  hsa_op_reg *packet_setup_u32_2 = new (hsa_allocp_operand_reg)
+    hsa_op_reg (BRIG_TYPE_U32);
+  c = new (hsa_allocp_operand_immed)
+    hsa_op_immed (build_int_cstu (uint32_type_node, 1));
+  insn = new (hsa_allocp_inst_basic)
+    hsa_insn_basic (3, BRIG_OPCODE_OR, BRIG_TYPE_U32, packet_setup_u32_2,
+		    packet_setup_u32, c);
+
+  hsa_append_insn (hbb, insn);
+  set_reg_def (packet_setup_u32_2, insn);
+
+  hsa_op_reg *packet_setup_reg_2 = new (hsa_allocp_operand_reg)
+    hsa_op_reg (BRIG_TYPE_U16);
+
+  cvtinsn = new (hsa_allocp_inst_basic)
+    hsa_insn_basic (2, BRIG_OPCODE_CVT, BRIG_TYPE_U16, packet_setup_reg_2,
+		    packet_setup_u32_2);
+  hsa_append_insn (hbb, cvtinsn);
+  set_reg_def (packet_setup_reg_2, cvtinsn);
+
+  addr = new (hsa_allocp_operand_address)
+	hsa_op_address (NULL, queue_packet_reg, offsetof
+			(hsa_queue_packet, setup));
+  mem = new (hsa_allocp_inst_mem) hsa_insn_mem
+    (BRIG_OPCODE_ST, BRIG_TYPE_U16, packet_setup_reg_2, addr);
+  hsa_append_insn (hbb, mem);
+
+  /* Write to packet->grid_size_x.  */
+  hsa_append_insn (hbb, new (hsa_allocp_inst_comment)
+		   hsa_insn_comment ("set packet->grid_size_x = 64"));
+
+  addr = new (hsa_allocp_operand_address)
+	hsa_op_address (NULL, queue_packet_reg, offsetof
+			(hsa_queue_packet, grid_size_x));
+  c = new (hsa_allocp_operand_immed) hsa_op_immed
+    (build_int_cstu (uint16_type_node, 64), false);
+  mem = new (hsa_allocp_inst_mem) hsa_insn_mem
+    (BRIG_OPCODE_ST, BRIG_TYPE_U16, c, addr);
+  hsa_append_insn (hbb, mem);
+
+  /* Write to packet->grid_size_x.  */
+  hsa_append_insn (hbb, new (hsa_allocp_inst_comment)
+		   hsa_insn_comment ("set packet->workgroup_size_x = 1"));
+
+  addr = new (hsa_allocp_operand_address)
+	hsa_op_address (NULL, queue_packet_reg, offsetof
+			(hsa_queue_packet, workgroup_size_x));
+  c = new (hsa_allocp_operand_immed)
+    hsa_op_immed (build_int_cstu (uint16_type_node, 1), false);
+  mem = new (hsa_allocp_inst_mem) hsa_insn_mem
+    (BRIG_OPCODE_ST, BRIG_TYPE_U16, c, addr);
+  hsa_append_insn (hbb, mem);
+
+  /* Write to packet->grid_size_y.  */
+  hsa_append_insn (hbb, new (hsa_allocp_inst_comment)
+		   hsa_insn_comment ("set packet->grid_size_y = 1"));
+
+  addr = new (hsa_allocp_operand_address)
+	hsa_op_address (NULL, queue_packet_reg, offsetof
+			(hsa_queue_packet, grid_size_y));
+  c = new (hsa_allocp_operand_immed)
+    hsa_op_immed (build_int_cstu (uint16_type_node, 1), false);
+  mem = new (hsa_allocp_inst_mem) hsa_insn_mem
+    (BRIG_OPCODE_ST, BRIG_TYPE_U16, c, addr);
+  hsa_append_insn (hbb, mem);
+
+  /* Write to packet->grid_size_y.  */
+  hsa_append_insn (hbb, new (hsa_allocp_inst_comment)
+		   hsa_insn_comment ("set packet->workgroup_size_y = 1"));
+
+  addr = new (hsa_allocp_operand_address)
+	hsa_op_address (NULL, queue_packet_reg, offsetof
+			(hsa_queue_packet, workgroup_size_y));
+  c = new (hsa_allocp_operand_immed)
+    hsa_op_immed (build_int_cstu (uint16_type_node, 1), false);
+  mem = new (hsa_allocp_inst_mem) hsa_insn_mem
+    (BRIG_OPCODE_ST, BRIG_TYPE_U16, c, addr);
+  hsa_append_insn (hbb, mem);
+
+  /* Write to packet->grid_size_z.  */
+  hsa_append_insn (hbb, new (hsa_allocp_inst_comment)
+		   hsa_insn_comment ("set packet->grid_size_z = 1"));
+
+  addr = new (hsa_allocp_operand_address)
+	hsa_op_address (NULL, queue_packet_reg, offsetof
+			(hsa_queue_packet, grid_size_z));
+  c = new (hsa_allocp_operand_immed)
+    hsa_op_immed (build_int_cstu (uint16_type_node, 1), false);
+  mem = new (hsa_allocp_inst_mem) hsa_insn_mem
+    (BRIG_OPCODE_ST, BRIG_TYPE_U16, c, addr);
+  hsa_append_insn (hbb, mem);
+
+  /* Write to packet->grid_size_z.  */
+  hsa_append_insn (hbb, new (hsa_allocp_inst_comment)
+		   hsa_insn_comment ("set packet->workgroup_size_z = 1"));
+
+  addr = new (hsa_allocp_operand_address)
+	hsa_op_address (NULL, queue_packet_reg, offsetof
+			(hsa_queue_packet, workgroup_size_z));
+  c = new (hsa_allocp_operand_immed)
+    hsa_op_immed (build_int_cstu (uint16_type_node, 1), false);
+  mem = new (hsa_allocp_inst_mem) hsa_insn_mem
+    (BRIG_OPCODE_ST, BRIG_TYPE_U16, c, addr);
+  hsa_append_insn (hbb, mem);
+
+  /* Write to packet->private_segment_size.  */
+  hsa_append_insn (hbb, new (hsa_allocp_inst_comment)
+		   hsa_insn_comment ("set packet->private_segment_size"));
+
+  hsa_op_reg *private_seg_reg_u16 = new (hsa_allocp_operand_reg)
+    hsa_op_reg (BRIG_TYPE_U16);
+
+  cvtinsn = new (hsa_allocp_inst_basic)
+    hsa_insn_basic (2, BRIG_OPCODE_CVT, BRIG_TYPE_U16, private_seg_reg_u16,
+		    private_seg_reg);
+  hsa_append_insn (hbb, cvtinsn);
+  set_reg_def (private_seg_reg_u16, cvtinsn);
+
+  addr = new (hsa_allocp_operand_address)
+	hsa_op_address (NULL, queue_packet_reg, offsetof
+			(hsa_queue_packet, private_segment_size));
+  mem = new (hsa_allocp_inst_mem) hsa_insn_mem
+    (BRIG_OPCODE_ST, BRIG_TYPE_U16, private_seg_reg_u16, addr);
+  hsa_append_insn (hbb, mem);
+
+  /* Write to packet->group_segment_size.  */
+  hsa_append_insn (hbb, new (hsa_allocp_inst_comment)
+		   hsa_insn_comment ("set packet->group_segment_size"));
+
+  hsa_op_reg *group_seg_reg_u16 = new (hsa_allocp_operand_reg)
+    hsa_op_reg (BRIG_TYPE_U16);
+
+  cvtinsn = new (hsa_allocp_inst_basic)
+    hsa_insn_basic (2, BRIG_OPCODE_CVT, BRIG_TYPE_U16, group_seg_reg_u16,
+		    group_seg_reg);
+  hsa_append_insn (hbb, cvtinsn);
+  set_reg_def (group_seg_reg_u16, cvtinsn);
+
+  addr = new (hsa_allocp_operand_address)
+	hsa_op_address (NULL, queue_packet_reg, offsetof
+			(hsa_queue_packet, group_segment_size));
+  mem = new (hsa_allocp_inst_mem) hsa_insn_mem
+    (BRIG_OPCODE_ST, BRIG_TYPE_U16, group_seg_reg_u16, addr);
+  hsa_append_insn (hbb, mem);
+
+  /* Write to packet->kernel_object.  */
+  hsa_append_insn (hbb, new (hsa_allocp_inst_comment)
+		   hsa_insn_comment ("set packet->kernel_object"));
+
+  addr = new (hsa_allocp_operand_address)
+	hsa_op_address (NULL, queue_packet_reg, offsetof
+			(hsa_queue_packet, kernel_object));
+  mem = new (hsa_allocp_inst_mem) hsa_insn_mem
+    (BRIG_OPCODE_ST, BRIG_TYPE_U64, object_reg, addr);
+  hsa_append_insn (hbb, mem);
+
+  /* Copy locally allocated memory for arguments to a prepared one.  */
+  hsa_append_insn (hbb, new (hsa_allocp_inst_comment)
+		   hsa_insn_comment ("get address of omp data memory"));
+
+  hsa_op_reg *omp_data_memory_reg = new (hsa_allocp_operand_reg)
+    hsa_op_reg (BRIG_TYPE_U64);
+
+  addr = new (hsa_allocp_operand_address)
+	hsa_op_address (NULL, shadow_reg,
+			offsetof (hsa_kernel_dispatch, omp_data_memory));
+  mem = new (hsa_allocp_inst_mem) hsa_insn_mem (BRIG_OPCODE_LD, BRIG_TYPE_U64,
+						omp_data_memory_reg, addr);
+  set_reg_def (omp_data_memory_reg, mem);
+  hsa_append_insn (hbb, mem);
+
+  tree argument = gimple_call_arg (call, 1);
+  gcc_assert (TREE_CODE (argument) == ADDR_EXPR);
+  hsa_symbol *var_decl = get_symbol_for_decl (TREE_OPERAND (argument, 0));
+
+  hsa_append_insn (hbb, new (hsa_allocp_inst_comment)
+		   hsa_insn_comment ("memory copy instructions"));
+  gen_hsa_memory_copy (hbb, var_decl, omp_data_memory_reg);
+
+  hsa_append_insn (hbb, new (hsa_allocp_inst_comment)
+		   hsa_insn_comment ("write memory pointer to "
+				     "packet->kernarg_address"));
+
+  addr = new (hsa_allocp_operand_address)
+	hsa_op_address (NULL, queue_packet_reg, offsetof
+			(hsa_queue_packet, kernarg_address));
+  mem = new (hsa_allocp_inst_mem) hsa_insn_mem
+    (BRIG_OPCODE_ST, BRIG_TYPE_U64, kernarg_reg, addr);
+  hsa_append_insn (hbb, mem);
+
+  /* Write to packet->kernarg_address.  */
+  hsa_append_insn (hbb, new (hsa_allocp_inst_comment)
+		   hsa_insn_comment ("write argument0 to "
+				     "*packet->kernarg_address"));
+
+  addr = new (hsa_allocp_operand_address)
+	hsa_op_address (NULL, kernarg_reg, 0);
+
+  mem = new (hsa_allocp_inst_mem) hsa_insn_mem (BRIG_OPCODE_ST, BRIG_TYPE_U64);
+  mem->operands[0] = omp_data_memory_reg;
+  mem->operands[1] = addr;
+  hsa_append_insn (hbb, mem);
+
+  /* Pass shadow argument to another dispatched kernel module.  */
+  hsa_append_insn (hbb, new (hsa_allocp_inst_comment)
+		   hsa_insn_comment ("write argument1 to "
+				     "*packet->kernarg_address"));
+
+  addr = new (hsa_allocp_operand_address)
+	hsa_op_address (NULL, kernarg_reg, 8);
+  mem = new (hsa_allocp_inst_mem) hsa_insn_mem (BRIG_OPCODE_ST, BRIG_TYPE_U64,
+						shadow_reg, addr);
+  hsa_append_insn (hbb, mem);
+
+  /* Write to packet->competion_signal.  */
+  hsa_append_insn (hbb, new (hsa_allocp_inst_comment)
+		   hsa_insn_comment ("set packet->completion_signal"));
+
+  addr = new (hsa_allocp_operand_address)
+	hsa_op_address (NULL, queue_packet_reg, offsetof
+			(hsa_queue_packet, completion_signal));
+  mem = new (hsa_allocp_inst_mem) hsa_insn_mem
+    (BRIG_OPCODE_ST, BRIG_TYPE_U64, signal_reg, addr);
+  hsa_append_insn (hbb, mem);
+
+  /* Atomically write to packer->header.  */
+  hsa_append_insn (hbb, new (hsa_allocp_inst_comment)
+		   hsa_insn_comment ("store atomically to packet->header"));
+
+  addr = new (hsa_allocp_operand_address)
+	hsa_op_address (NULL, queue_packet_reg, offsetof
+			(hsa_queue_packet, header));
+
+  /* Store 5122 << 16 + 1 to packet->header.  */
+  c = new (hsa_allocp_operand_immed)
+    hsa_op_immed (build_int_cstu (uint32_type_node, 70658));
+
+  hsa_insn_atomic *atomic = new (hsa_allocp_inst_atomic)
+    hsa_insn_atomic (2, BRIG_OPCODE_ATOMICNORET, BRIG_ATOMIC_ST, BRIG_TYPE_B32);
+  atomic->memoryorder = BRIG_MEMORY_ORDER_SC_RELEASE;
+  atomic->memoryscope = BRIG_MEMORY_SCOPE_SYSTEM;
+  atomic->operands[0] = addr;
+  atomic->operands[1] = c;
+
+  hsa_append_insn (hbb, atomic);
+
+  /* Ring doorbell signal.  */
+  hsa_append_insn (hbb, new (hsa_allocp_inst_comment)
+		   hsa_insn_comment ("store index to doorbell signal"));
+
+  hsa_op_reg *doorbell_signal_reg = new (hsa_allocp_operand_reg)
+    hsa_op_reg (BRIG_TYPE_U64);
+  addr = new (hsa_allocp_operand_address)
+	hsa_op_address (NULL, queue_reg, offsetof
+			(hsa_queue, doorbell_signal));
+  mem = new (hsa_allocp_inst_mem) hsa_insn_mem
+    (BRIG_OPCODE_LD, BRIG_TYPE_U64, doorbell_signal_reg, addr);
+  hsa_append_insn (hbb, mem);
+
+  signal = new (hsa_allocp_inst_signal)
+    hsa_insn_signal (2, BRIG_OPCODE_SIGNALNORET, BRIG_ATOMIC_ST,
+		     BRIG_TYPE_B64);
+  signal->memoryorder = BRIG_MEMORY_ORDER_SC_RELEASE;
+  signal->memoryscope = BRIG_MEMORY_SCOPE_SYSTEM;
+  signal->operands[0] = doorbell_signal_reg;
+  signal->operands[1] = queue_index_reg;
+  hsa_append_insn (hbb, signal);
+
+  /* Emit blocking signal waiting instruction.  */
+  hsa_append_insn (hbb, new (hsa_allocp_inst_comment)
+		   hsa_insn_comment ("wait for the signal"));
+
+  hsa_op_reg *signal_result_reg = new (hsa_allocp_operand_reg)
+    hsa_op_reg (BRIG_TYPE_U64);
+  c = new (hsa_allocp_operand_immed)
+    hsa_op_immed (build_int_cst (long_integer_type_node, 1));
+  hsa_op_immed *c2 = new (hsa_allocp_operand_immed) hsa_op_immed
+    (build_int_cst (uint64_type_node, UINT64_MAX));
+
+  signal = new (hsa_allocp_inst_signal)
+    hsa_insn_signal (4, BRIG_OPCODE_SIGNAL, BRIG_ATOMIC_WAITTIMEOUT_LT,
+		     BRIG_TYPE_S64);
+  signal->memoryorder = BRIG_MEMORY_ORDER_SC_ACQUIRE;
+  signal->memoryscope = BRIG_MEMORY_SCOPE_SYSTEM;
+  signal->operands[0] = signal_result_reg;
+  signal->operands[1] = signal_reg;
+  signal->operands[2] = c;
+  signal->operands[3] = c2;
+  hsa_append_insn (hbb, signal);
+
+  hsa_cfun->kernel_dispatch_count++;
+}
+
 /* Generate HSA instructions for the given call statement STMT.  Instructions
    will be appended to HBB.  SSA_MAP maps gimple SSA names to HSA pseudo
-   registers. */
+   registers.  */
 
 static void
 gen_hsa_insns_for_call (gimple stmt, hsa_bb *hbb,
@@ -1947,7 +2735,8 @@  gen_hsa_insns_for_call (gimple stmt, hsa_bb *hbb,
       return;
     }
 
-  switch (DECL_FUNCTION_CODE (gimple_call_fndecl (stmt)))
+  tree fndecl = gimple_call_fndecl (stmt);
+  switch (DECL_FUNCTION_CODE (fndecl))
     {
     case BUILT_IN_OMP_GET_THREAD_NUM:
       opcode = BRIG_OPCODE_WORKITEMABSID;
@@ -2044,7 +2833,7 @@  specialop:
 	BrigType16_t atype  = hsa_bittype_for_type
 	  (hsa_type_for_scalar_tree_type (TREE_TYPE (lhs), false));
 	hsa_insn_atomic *atominsn = new (hsa_allocp_inst_atomic)
-	  hsa_insn_atomic (BRIG_OPCODE_ATOMIC, BRIG_ATOMIC_CAS, atype);
+	  hsa_insn_atomic (4, BRIG_OPCODE_ATOMIC, BRIG_ATOMIC_CAS, atype);
 	hsa_op_address *addr;
 	addr = gen_hsa_addr (gimple_call_arg (stmt, 0), hbb, ssa_map);
 	dest = hsa_reg_for_gimple_ssa (lhs, ssa_map);
@@ -2067,7 +2856,21 @@  specialop:
 	hsa_append_insn (hbb, atominsn);
 	break;
       }
+    case BUILT_IN_GOMP_PARALLEL:
+      {
+	gcc_checking_assert (gimple_call_num_args (stmt) == 4);
+	tree called = gimple_call_arg (stmt, 0);
+	gcc_checking_assert (TREE_CODE (called) == ADDR_EXPR);
+	called = TREE_OPERAND (called, 0);
+	gcc_checking_assert (TREE_CODE (called) == FUNCTION_DECL);
+
+	const char *name = get_declaration_name (called);
+	hsa_add_kernel_dependency (hsa_cfun->decl,
+				   hsa_brig_function_name (name));
+	gen_hsa_insns_for_kernel_call (hbb, as_a <gcall *> (stmt));
 
+	break;
+      }
     default:
       sorry ("Support for HSA does not implement calls to builtin %D",
 	     gimple_call_fndecl (stmt));
@@ -2321,7 +3124,11 @@  gen_function_def_parameters (hsa_function_representation *f,
   f->prologue.bb = ENTRY_BLOCK_PTR_FOR_FN (cfun);
 
   f->input_args_count = count;
-  f->input_args = XCNEWVEC (hsa_symbol, f->input_args_count);
+
+  /* Allocate one more argument which can be potentially used for a kernel
+     dispatching.  */
+  f->input_args = XCNEWVEC (hsa_symbol, f->input_args_count + 1);
+
   for (parm = DECL_ARGUMENTS (cfun->decl), i = 0;
        parm;
        parm = DECL_CHAIN (parm), i++)
diff --git a/gcc/hsa-regalloc.c b/gcc/hsa-regalloc.c
index bc2d227..68551601 100644
--- a/gcc/hsa-regalloc.c
+++ b/gcc/hsa-regalloc.c
@@ -267,6 +267,7 @@  hsa_num_def_ops (hsa_insn_basic *insn)
 
       case BRIG_OPCODE_SIGNAL:
 	return 1;
+
       case BRIG_OPCODE_SIGNALNORET:
 	return 0;
 
@@ -328,7 +329,6 @@  hsa_num_def_ops (hsa_insn_basic *insn)
 
       case BRIG_OPCODE_PACKETCOMPLETIONSIG:
       case BRIG_OPCODE_PACKETID:
-      case BRIG_OPCODE_ADDQUEUEWRITEINDEX:
       case BRIG_OPCODE_CASQUEUEWRITEINDEX:
       case BRIG_OPCODE_LDQUEUEREADINDEX:
       case BRIG_OPCODE_LDQUEUEWRITEINDEX:
@@ -336,6 +336,9 @@  hsa_num_def_ops (hsa_insn_basic *insn)
       case BRIG_OPCODE_STQUEUEWRITEINDEX:
 	return 1; /* ??? */
 
+      case BRIG_OPCODE_ADDQUEUEWRITEINDEX:
+	return 1;
+
       case BRIG_OPCODE_DEBUGTRAP:
 	return 0;
 
@@ -345,6 +348,9 @@  hsa_num_def_ops (hsa_insn_basic *insn)
 
       case HSA_OPCODE_ARG_BLOCK:
 	return 0;
+
+      case BRIG_KIND_DIRECTIVE_COMMENT:
+	return 0;
     }
 }
 
diff --git a/gcc/hsa.c b/gcc/hsa.c
index 730236b..2c02a82 100644
--- a/gcc/hsa.c
+++ b/gcc/hsa.c
@@ -91,6 +91,10 @@  struct GTY(()) hsa_decl_kernel_map_element
 
 static GTY (()) vec<hsa_decl_kernel_map_element, va_gc> *hsa_decl_kernel_mapping;
 
+/* Mapping between decls and corresponding HSA kernels
+   called by the function.  */
+hash_map <tree, vec <char *> *> *hsa_decl_kernel_dependencies;
+
 /* Hash function to lookup a symbol for a decl.  */
 hash_table <hsa_free_symbol_hasher> *hsa_global_variable_symbols;
 
@@ -360,16 +364,53 @@  hsa_free_decl_kernel_mapping (void)
   ggc_free (hsa_decl_kernel_mapping);
 }
 
+/* Add new kernel dependency.  */
+
+void
+hsa_add_kernel_dependency (tree caller, char *called_function)
+{
+  if (hsa_decl_kernel_dependencies == NULL)
+    hsa_decl_kernel_dependencies = new hash_map<tree, vec<char *> *> ();
+
+  vec <char *> *s = NULL;
+  vec <char *> **slot = hsa_decl_kernel_dependencies->get (caller);
+  if (slot == NULL)
+    {
+      s = new vec <char *> ();
+      hsa_decl_kernel_dependencies->put (caller, s);
+    }
+  else
+    s = *slot;
+
+  s->safe_push (called_function);
+}
+
 /* Modify the name P in-place so that it is a valid HSA identifier.  */
 
 void
 hsa_sanitize_name (char *p)
 {
   for (; *p; p++)
-    if (*p == '.')
+    if (*p == '.' || *p == '-')
       *p = '_';
 }
 
+/* Clone the name P, set trailing ampersand and sanitize the name.  */
+
+char *
+hsa_brig_function_name (const char *p)
+{
+  unsigned len = strlen (p);
+  char *buf = XNEWVEC (char, len + 2);
+
+  buf[0] = '&';
+  buf[len + 1] = '\0';
+  memcpy (buf + 1, p, len);
+
+  hsa_sanitize_name (buf);
+  return buf;
+}
+
 /* Return declaration name if exists.  */
 
 const char *
diff --git a/gcc/hsa.h b/gcc/hsa.h
index 9f2a018..03c4e50 100644
--- a/gcc/hsa.h
+++ b/gcc/hsa.h
@@ -109,7 +109,7 @@  private:
 class hsa_op_immed : public hsa_op_with_type
 {
 public:
-  hsa_op_immed (tree tree_val);
+  hsa_op_immed (tree tree_val, bool min32int = true);
 
   /* Value as represented by middle end.  */
   tree value;
@@ -286,7 +286,10 @@  class hsa_insn_basic
 {
 public:
   hsa_insn_basic (unsigned nops, int opc);
-  hsa_insn_basic (unsigned nops, int opc, BrigType16_t t);
+  hsa_insn_basic (unsigned nops, int opc, BrigType16_t t,
+		  hsa_op_base *arg0 = NULL,
+		  hsa_op_base *arg1 = NULL,
+		  hsa_op_base *arg2 = NULL);
 
   /* The previous and next instruction in the basic block.  */
   hsa_insn_basic *prev, *next;
@@ -412,7 +415,9 @@  is_a_helper <hsa_insn_cmp *>::test (hsa_insn_basic *p)
 class hsa_insn_mem : public hsa_insn_basic
 {
 public:
-  hsa_insn_mem (int opc, BrigType16_t t);
+  hsa_insn_mem (int opc, BrigType16_t t,
+		hsa_op_base *arg0 = NULL,
+		hsa_op_base *arg1 = NULL);
 
   /* The segment is of the memory access is either the segment of the symbol in
      the address operand or flat address is there is no symbol there.  */
@@ -454,7 +459,8 @@  is_a_helper <hsa_insn_mem *>::test (hsa_insn_basic *p)
 class hsa_insn_atomic : public hsa_insn_mem
 {
 public:
-  hsa_insn_atomic (int opc, enum BrigAtomicOperation aop, BrigType16_t t);
+  hsa_insn_atomic (int nops, int opc, enum BrigAtomicOperation aop,
+		   BrigType16_t t);
 
   /* The operation itself.  */
   enum BrigAtomicOperation atomicop;
@@ -466,7 +472,7 @@  private:
   void operator delete (void *) {}
 };
 
-/* Report whether or not P is a memory instruction.  */
+/* Report whether or not P is an atomic instruction.  */
 
 template <>
 template <>
@@ -477,6 +483,31 @@  is_a_helper <hsa_insn_atomic *>::test (hsa_insn_basic *p)
 	  || p->opcode == BRIG_OPCODE_ATOMICNORET);
 }
 
+/* HSA instruction for signal operations.  */
+
+class hsa_insn_signal : public hsa_insn_atomic
+{
+public:
+  hsa_insn_signal (int nops, int opc, enum BrigAtomicOperation sop,
+		   BrigType16_t t);
+
+private:
+  /* All objects are deallocated by destroying their pool, so make delete
+     inaccessible too.  */
+  void operator delete (void *) {}
+};
+
+/* Report whether or not P is a signal instruction.  */
+
+template <>
+template <>
+inline bool
+is_a_helper <hsa_insn_signal *>::test (hsa_insn_basic *p)
+{
+  return (p->opcode == BRIG_OPCODE_SIGNAL
+	  || p->opcode == BRIG_OPCODE_SIGNALNORET);
+}
+
 /* HSA instruction to convert between flat addressing and segments.  */
 
 class hsa_insn_seg : public hsa_insn_basic
@@ -591,6 +622,54 @@  is_a_helper <hsa_insn_arg_block *>::test (hsa_insn_basic *p)
   return (p->opcode == HSA_OPCODE_ARG_BLOCK);
 }
 
+/* HSA comment instruction.  */
+
+class hsa_insn_comment: public hsa_insn_basic
+{
+public:
+  /* Constructor of class representing the comment in HSAIL.  */
+  hsa_insn_comment (const char *s);
+
+  /* Destructor.  */
+  ~hsa_insn_comment ();
+
+  /* Release memory for comment.  */
+  void release_string ();
+
+  char *comment;
+};
+
+/* Report whether or not P is a call block instruction.  */
+
+template <>
+template <>
+inline bool
+is_a_helper <hsa_insn_comment *>::test (hsa_insn_basic *p)
+{
+  return (p->opcode == BRIG_KIND_DIRECTIVE_COMMENT);
+}
+
+/* HSA queue instruction.  */
+
+class hsa_insn_queue: public hsa_insn_basic
+{
+public:
+  hsa_insn_queue (int nops, BrigOpcode opcode);
+
+  /* Destructor.  */
+  ~hsa_insn_queue ();
+};
+
+/* Report whether or not P is a call block instruction.  */
+
+template <>
+template <>
+inline bool
+is_a_helper <hsa_insn_queue *>::test (hsa_insn_basic *p)
+{
+  return (p->opcode == BRIG_OPCODE_ADDQUEUEWRITEINDEX);
+}
+
 /* Basic block of HSA instructions.  */
 
 class hsa_bb
@@ -689,6 +768,9 @@  public:
   hsa_function_representation ();
   ~hsa_function_representation ();
 
+  /* Builds a shadow register that is utilized to a kernel dispatch.  */
+  hsa_op_reg *get_shadow_reg ();
+
   /* Name of the function.  */
   char *name;
 
@@ -729,11 +811,19 @@  public:
 
   /* Function declaration tree.  */
   tree decl;
+
+  /* Runtime shadow register.  */
+  hsa_op_reg *shadow_reg;
+
+  /* Number of kernel dispatched which take place in the function.  */
+  unsigned kernel_dispatch_count;
 };
 
 /* in hsa.c */
 extern struct hsa_function_representation *hsa_cfun;
 extern hash_table <hsa_free_symbol_hasher> *hsa_global_variable_symbols;
+extern hash_map <tree, vec <char *> *> *hsa_decl_kernel_dependencies;
+extern unsigned hsa_kernel_calls_counter;
 void hsa_init_compilation_unit_data (void);
 void hsa_deinit_compilation_unit_data (void);
 bool hsa_machine_large_p (void);
@@ -748,7 +838,9 @@  unsigned hsa_get_number_decl_kernel_mappings (void);
 tree hsa_get_decl_kernel_mapping_decl (unsigned i);
 char *hsa_get_decl_kernel_mapping_name (unsigned i);
 void hsa_free_decl_kernel_mapping (void);
+void hsa_add_kernel_dependency (tree caller, char *called_function);
 void hsa_sanitize_name (char *p);
+char *hsa_brig_function_name (const char *p);
 const char *get_declaration_name (tree decl);
 
 /* In hsa-gen.c.  */
diff --git a/libgomp/hsa-traits.h b/libgomp/hsa-traits.h
new file mode 100644
index 0000000..3b20008
--- /dev/null
+++ b/libgomp/hsa-traits.h
@@ -0,0 +1,52 @@ 
+/* HSA traits.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef HSA_TRAITS_H_
+#define HSA_TRAITS_H_
+
+/* HSA kernel dispatch is collection of informations needed for
+   a kernel dispatch.  */
+
+struct hsa_kernel_dispatch
+{
+  /* Pointer to a command queue associated with a kernel dispatch agent.  */
+  void *queue;
+  /* Pointer to reserved memory for OMP data struct copying.  */
+  void *omp_data_memory;
+  /* Pointer to a memory space used for kernel arguments passing.  */
+  void *kernarg_address;
+  /* Kernel object.  */
+  uint64_t object;
+  /* Synchronization signal used for dispatch synchronization.  */
+  uint64_t signal;
+  /* Private segment size.  */
+  uint32_t private_segment_size;
+  /* Group segment size.  */
+  uint32_t group_segment_size;
+  /* Number of children kernel dispatches.  */
+  uint64_t kernel_dispatch_count;
+  /* Debug purpose argument.  */
+  uint64_t debug;
+  /* Kernel dispatch structures created for children kernel dispatches.  */
+  struct hsa_kernel_dispatch **children_dispatches;
+};
+
+#endif // HSA_TRAITS_H_
diff --git a/libgomp/plugin/plugin-hsa.c b/libgomp/plugin/plugin-hsa.c
index 4b5dc3b..c5af705 100644
--- a/libgomp/plugin/plugin-hsa.c
+++ b/libgomp/plugin/plugin-hsa.c
@@ -4,6 +4,7 @@ 
 #include <pthread.h>
 #include "libgomp-plugin.h"
 #include "hsa.h"
+#include "hsa-traits.h"
 #include "hsa_ext_finalize.h"
 
 /* Part of the libgomp plugin interface.  Return the name of the accelerator,
@@ -66,6 +67,7 @@  struct brig_image_desc
 {
   hsa_ext_module_t brig_module;
   const char *names;
+  const char *kernel_dependencies;
 };
 
 struct agent_info;
@@ -79,6 +81,8 @@  struct kernel_info
   /* The specific agent the kernel has been or will be finalized for and run
      on.  */
   struct agent_info *agent;
+  /* The specific module where the kernel takes place.  */
+  struct module_info *module;
   /* Mutex enforcing that at most once thread ever initializes a kernel for
      use.  A thread should have locked agent->modules_rwlock for reading before
      acquiring it.  */
@@ -94,6 +98,10 @@  struct kernel_info
   uint32_t group_segment_size;
   /* Required size of private segment.  */
   uint32_t private_segment_size;
+  /* List of all kernel dependencies.  */
+  const char **dependencies;
+  /* Number of dependencies.  */
+  unsigned dependencies_count;
 };
 
 /* Information about a particular brig module, its image and kernels.  */
@@ -107,6 +115,8 @@  struct module_info
 
   /* Number of kernels in this module.  */
   int kernel_count;
+  /* Number of kernel from kernel dispatches.  */
+  unsigned kernel_dispatch_count;
   /* An array of kernel_info structures describing each kernel in this
      module.  */
   struct kernel_info kernels[];
@@ -164,6 +174,19 @@  struct hsa_context_info
 
 static struct hsa_context_info hsa_context;
 
+/* Find kernel in MODULE by name provided in KERNEL_NAME.  */
+
+static struct kernel_info *
+get_kernel_in_module (struct module_info *module, const char *kernel_name)
+{
+  for (unsigned i = 0; i < module->kernel_count; i++)
+    if (strcmp (module->kernels[i].name, kernel_name) == 0)
+      return &module->kernels[i];
+
+  GOMP_PLUGIN_fatal ("Could not find kernel dependency: %s\n", kernel_name);
+  return NULL;
+}
+
 /* Return true if the agent is a GPU and acceptable of concurrent submissions
    from different threads.  */
 
@@ -451,12 +474,15 @@  GOMP_OFFLOAD_load_image (int ord, void *target_data,
 
   p = image_desc->names;
   kernel = &module->kernels[0];
+
+  /* Parse all kernels.  */
   while (*p)
     {
       pair->start = (uintptr_t) kernel;
       pair->end = (uintptr_t) (kernel + 1);
       kernel->name = p;
       kernel->agent = agent;
+      kernel->module = module;
       if (pthread_mutex_init (&kernel->init_mutex, NULL))
 	GOMP_PLUGIN_fatal ("Failed to initialize an HSA kernel mutex");
       kernel++;
@@ -467,6 +493,64 @@  GOMP_OFFLOAD_load_image (int ord, void *target_data,
       p++;
     }
 
+  /* Load length of kernel dependencies.  */
+  p = image_desc->kernel_dependencies;
+  unsigned i = 0;
+  do
+    {
+      while (*p != 0)
+	{
+	  if (*p == '.')
+	    p += 2;
+	  else
+	    {
+	      module->kernels[i].dependencies_count++;
+	      module->kernel_dispatch_count++;
+
+	      do
+		p++;
+	      while (*p);
+	      p++;
+	    }
+	}
+
+      i++;
+      p++;
+    }
+  while (i <= kernel_count - 1);
+
+  /* Allocate memory for kernel dependencies.  */
+  for (unsigned i = 0; i < kernel_count; i++)
+    module->kernels[i].dependencies = GOMP_PLUGIN_malloc
+      (sizeof (char *) * module->kernels[i].dependencies_count);
+
+  /* Parse all kernel dependencies.  */
+  p = image_desc->kernel_dependencies;
+  i = 0;
+
+  do
+    {
+      int j = 0;
+      while (*p != 0)
+	{
+	  if (*p == '.')
+	    p += 2;
+	  else
+	    {
+	      module->kernels[i].dependencies[j++] = p;
+
+	      do
+		p++;
+	      while (*p);
+	      p++;
+	    }
+	}
+
+      i++;
+      p++;
+    }
+  while (i <= kernel_count - 1);
+
   add_module_to_agent (agent, module);
   if (pthread_rwlock_unlock (&agent->modules_rwlock))
     GOMP_PLUGIN_fatal ("Unable to unlock an HSA agent rwlock");
@@ -545,23 +629,75 @@  create_and_finalize_hsa_program (struct agent_info *agent)
     GOMP_PLUGIN_fatal ("Could not unlock an HSA agent program mutex");
 }
 
-/* Do all the work that is necessary before running KERNEL for the first time.
-   The function assumes the program has been created, finalized and frozen by
-   create_and_finalize_hsa_program.  */
+/* Create kernel dispatch data structure for given KERNEL.  */
+
+static struct hsa_kernel_dispatch *
+create_kernel_dispatch (struct kernel_info *kernel)
+{
+  struct agent_info *agent = kernel->agent;
+  struct hsa_kernel_dispatch *shadow = GOMP_PLUGIN_malloc_cleared
+    (sizeof (struct hsa_kernel_dispatch));
+
+  shadow->queue = agent->command_q;
+
+  /* Compute right size needed for memory allocation.  */
+  shadow->omp_data_memory = GOMP_PLUGIN_malloc (100);
+
+  unsigned dispatch_count = kernel->dependencies_count;
+  shadow->kernel_dispatch_count = dispatch_count;
+
+  shadow->children_dispatches = GOMP_PLUGIN_malloc
+    (dispatch_count * sizeof (struct hsa_kernel_dispatch *));
+
+  shadow->object = kernel->object;
+
+  hsa_signal_t sync_signal;
+  hsa_status_t status = hsa_signal_create (1, 0, NULL, &sync_signal);
+  if (status != HSA_STATUS_SUCCESS)
+    hsa_fatal ("Error creating the HSA sync signal", status);
+
+  shadow->signal = sync_signal.handle;
+  shadow->private_segment_size = kernel->private_segment_size;
+  shadow->group_segment_size = kernel->group_segment_size;
+
+  status = hsa_memory_allocate
+    (agent->kernarg_region, kernel->kernarg_segment_size,
+     &shadow->kernarg_address);
+  if (status != HSA_STATUS_SUCCESS)
+    hsa_fatal ("Could not allocate memory for HSA kernel arguments", status);
+
+  return shadow;
+}
+
+/* Release data structure created for a kernel dispatch in SHADOW argument.  */
 
 static void
-init_kernel (struct kernel_info *kernel)
+release_kernel_dispatch (struct hsa_kernel_dispatch *shadow)
 {
-  if (pthread_mutex_lock (&kernel->init_mutex))
-    GOMP_PLUGIN_fatal ("Could not lock an HSA kernel initialization mutex");
-  if (kernel->initialized)
-    {
-      if (pthread_mutex_unlock (&kernel->init_mutex))
-	GOMP_PLUGIN_fatal ("Could not unlock an HSA kernel initialization "
-			   "mutex");
-      return;
-    }
+  if (debug)
+    fprintf (stderr, "Released kernel dispatch: %p has value: %lu (%p)\n",
+	     shadow, shadow->debug, (void *)shadow->debug);
+
+  hsa_memory_free (shadow->kernarg_address);
+
+  hsa_signal_t s;
+  s.handle = shadow->signal;
+  hsa_signal_destroy (s);
+
+  free (shadow->omp_data_memory);
+
+  for (unsigned i = 0; i < shadow->kernel_dispatch_count; i++)
+    release_kernel_dispatch (shadow->children_dispatches[i]);
 
+  free (shadow->children_dispatches);
+  free (shadow);
+}
+
+/* Initialize a KERNEL without its dependencies.  */
+
+static void
+init_single_kernel (struct kernel_info *kernel)
+{
   hsa_status_t status;
   struct agent_info *agent = kernel->agent;
   hsa_executable_symbol_t kernel_symbol;
@@ -593,7 +729,8 @@  init_kernel (struct kernel_info *kernel)
 
   if (debug)
     {
-      fprintf (stderr, "Kernel structure for %s fully initialized\n",
+      fprintf (stderr, "Kernel structure for %s fully initialized with "
+	       "following segment sizes: \n",
 	       kernel->name);
       fprintf (stderr, "  group_segment_size: %u\n",
 	       (unsigned) kernel->group_segment_size);
@@ -602,6 +739,103 @@  init_kernel (struct kernel_info *kernel)
       fprintf (stderr, "  kernarg_segment_size: %u\n",
 	       (unsigned) kernel->kernarg_segment_size);
     }
+}
+
+/* Indent stream F by INDENT spaces.  */
+
+static void
+indent_stream (FILE *f, unsigned indent)
+{
+  for (int i = 0; i < indent; i++)
+    fputc (' ', f);
+}
+
+/* Dump kernel DISPATCH data structure and indent it by INDENT spaces.  */
+
+static void
+print_kernel_dispatch (struct hsa_kernel_dispatch *dispatch, unsigned indent)
+{
+  indent_stream (stderr, indent);
+  fprintf (stderr, "this: %p\n", dispatch);
+  indent_stream (stderr, indent);
+  fprintf (stderr, "queue: %p\n", dispatch->queue);
+  indent_stream (stderr, indent);
+  fprintf (stderr, "omp_data_memory: %p\n", dispatch->omp_data_memory);
+  indent_stream (stderr, indent);
+  fprintf (stderr, "kernarg_address: %p\n", dispatch->kernarg_address);
+  indent_stream (stderr, indent);
+  fprintf (stderr, "object: %lu\n", dispatch->object);
+  indent_stream (stderr, indent);
+  fprintf (stderr, "signal: %lu\n", dispatch->signal);
+  indent_stream (stderr, indent);
+  fprintf (stderr, "private_segment_size: %u\n",
+	   dispatch->private_segment_size);
+  indent_stream (stderr, indent);
+  fprintf (stderr, "group_segment_size: %u\n",
+	   dispatch->group_segment_size);
+  indent_stream (stderr, indent);
+  fprintf (stderr, "children dispatches: %lu\n",
+	   dispatch->kernel_dispatch_count);
+  fprintf (stderr, "\n");
+
+  for (unsigned i = 0; i < dispatch->kernel_dispatch_count; i++)
+      print_kernel_dispatch (dispatch->children_dispatches[i], indent + 2);
+}
+
+/* Create kernel dispatch data structure for a KERNEL and all its
+   dependencies.  */
+
+static struct hsa_kernel_dispatch *
+create_kernel_dispatch_recursive (struct kernel_info *kernel)
+{
+  // TODO: find correct module
+  struct module_info *module = kernel->agent->first_module;
+  struct hsa_kernel_dispatch *shadow = create_kernel_dispatch (kernel);
+  shadow->debug = 0;
+
+  for (unsigned i = 0; i < kernel->dependencies_count; i++)
+    {
+      struct kernel_info *dependency = get_kernel_in_module
+	(module, kernel->dependencies[i]);
+      shadow->children_dispatches[i] = create_kernel_dispatch_recursive
+	(dependency);
+    }
+
+  return shadow;
+}
+
+/* Do all the work that is necessary before running KERNEL for the first time.
+   The function assumes the program has been created, finalized and frozen by
+   create_and_finalize_hsa_program.  */
+
+void
+init_kernel (struct kernel_info *kernel)
+{
+  if (pthread_mutex_lock (&kernel->init_mutex))
+    GOMP_PLUGIN_fatal ("Could not lock an HSA kernel initialization mutex");
+  if (kernel->initialized)
+    {
+      if (pthread_mutex_unlock (&kernel->init_mutex))
+	GOMP_PLUGIN_fatal ("Could not unlock an HSA kernel initialization "
+			   "mutex");
+      return;
+    }
+
+  init_single_kernel (kernel);
+
+  struct agent_info *agent = kernel->agent;
+  struct module_info *module = agent->first_module;
+
+  for (unsigned i = 0; i < kernel->dependencies_count; i++)
+    {
+      struct kernel_info *dependency = get_kernel_in_module
+	(module, kernel->dependencies[i]);
+      init_single_kernel (dependency);
+    }
+
+  if (debug)
+    fprintf (stderr, "\n");
+
   kernel->initialized = true;
   if (pthread_mutex_unlock (&kernel->init_mutex))
     GOMP_PLUGIN_fatal ("Could not unlock an HSA kernel initialization "
@@ -622,18 +856,14 @@  GOMP_OFFLOAD_run (int n, void *fn_ptr, void *vars)
 
   create_and_finalize_hsa_program (agent);
   init_kernel (kernel);
+  struct hsa_kernel_dispatch *shadow = create_kernel_dispatch_recursive
+    (kernel);
 
-  hsa_status_t status;
-  void *kernarg_addr;
-  /* Allocate the kernel argument buffer from the correct region.  */
-  status = hsa_memory_allocate (agent->kernarg_region,
-				kernel->kernarg_segment_size, &kernarg_addr);
-  if (status != HSA_STATUS_SUCCESS)
-    hsa_fatal ("Could not allocate memory for HSA kernel arguments", status);
-  hsa_signal_t sync_signal;
-  status = hsa_signal_create (1, 0, NULL, &sync_signal);
-  if (status != HSA_STATUS_SUCCESS)
-    hsa_fatal ("Error creating the HSA sync signal", status);
+  if (debug)
+    {
+      fprintf (stderr, "\nKernel has following dependencies:\n");
+      print_kernel_dispatch (shadow, 2);
+    }
 
   uint64_t index = hsa_queue_add_write_index_release (agent->command_q, 1);
   if (debug)
@@ -647,7 +877,7 @@  GOMP_OFFLOAD_run (int n, void *fn_ptr, void *vars)
   hsa_kernel_dispatch_packet_t *packet;
   packet = ((hsa_kernel_dispatch_packet_t*) agent->command_q->base_address)
     + index % agent->command_q->size;
-  hsa_signal_store_relaxed (sync_signal, 1);
+
   memset (((uint8_t *)packet) + 4, 0, sizeof (*packet) - 4);
   packet->setup  |= (uint16_t) 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
   packet->grid_size_x = 1;
@@ -659,9 +889,22 @@  GOMP_OFFLOAD_run (int n, void *fn_ptr, void *vars)
   packet->private_segment_size = kernel->private_segment_size;
   packet->group_segment_size = kernel->group_segment_size;
   packet->kernel_object = kernel->object;
-  packet->kernarg_address = kernarg_addr;
-  packet->completion_signal = sync_signal;
-  memcpy (kernarg_addr, &vars, sizeof(vars));
+  packet->kernarg_address = shadow->kernarg_address;
+  hsa_signal_t s;
+  s.handle = shadow->signal;
+  packet->completion_signal = s;
+  hsa_signal_store_relaxed (s, 1);
+  memcpy (shadow->kernarg_address, &vars, sizeof (vars));
+
+  /* Append shadow pointer to kernel arguments.  */
+  if (kernel->dependencies_count > 0)
+    {
+      memcpy (shadow->kernarg_address + sizeof (vars), &shadow,
+	      sizeof (struct hsa_kernel_runtime *));
+
+      if (debug)
+	fprintf (stderr, "Copying kernel runtime pointer to kernarg_address\n");
+    }
 
   uint16_t header;
   header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
@@ -676,12 +919,11 @@  GOMP_OFFLOAD_run (int n, void *fn_ptr, void *vars)
 
   if (debug)
     fprintf (stderr, "Kernel dispatched, waiting for completion\n");
-  hsa_signal_wait_acquire(sync_signal, HSA_SIGNAL_CONDITION_LT, 1,
-			  UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
-  if (debug)
-    fprintf (stderr, "Kernel %s returned\n", kernel->name);
-  hsa_signal_destroy(sync_signal);
-  hsa_memory_free (kernarg_addr);
+  hsa_signal_wait_acquire (s, HSA_SIGNAL_CONDITION_LT, 1,
+			   UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
+
+  release_kernel_dispatch (shadow);
+
   if (pthread_rwlock_unlock (&agent->modules_rwlock))
     GOMP_PLUGIN_fatal ("Unable to unlock an HSA agent rwlock");
 }
@@ -695,7 +937,8 @@  destroy_module (struct module_info *module)
   int i;
   for (i = 0; i < module->kernel_count; i++)
     if (pthread_mutex_destroy (&module->kernels[i].init_mutex))
-      GOMP_PLUGIN_fatal ("Failed to destroy an HSA kernel initialization mutex");
+      GOMP_PLUGIN_fatal ("Failed to destroy an HSA kernel initialization "
+			 "mutex");
 }
 
 /* Part of the libgomp plugin interface.  Unload BRIG module described by
-- 
2.4.6