diff mbox

[2/5] OpenACC tile clause support, omp-low parts

Message ID c5f37d54-c921-50c0-6b42-b4b6fa5813b0@mentor.com
State New
Headers show

Commit Message

Chung-Lin Tang Nov. 10, 2016, 10:45 a.m. UTC
This part is the bulk of the patch set. It consists of
the definition of the GOACC_TILE internal fn, the lowering/expanding
of this in omp-low, as well as loop auto-partitioning adjustments
that help improve generated code.

This patch corresponds to the gomp4 committed patches:
https://gcc.gnu.org/ml/gcc-patches/2016-05/msg00518.html
https://gcc.gnu.org/ml/gcc-patches/2016-09/msg02031.html
https://gcc.gnu.org/ml/gcc-patches/2016-09/msg02351.html
https://gcc.gnu.org/ml/gcc-patches/2016-09/msg02418.html
https://gcc.gnu.org/ml/gcc-patches/2016-10/msg00107.html
https://gcc.gnu.org/ml/gcc-patches/2016-10/msg00152.html
https://gcc.gnu.org/ml/gcc-patches/2016-10/msg00314.html

with some small bits separated out to the prior patch in this series.

Thanks,
Chung-Lin

2016-XX-XX  Nathan Sidwell  <nathan@codesourcery.com>

        * internal-fn.def (GOACC_DIM_POS): Comment may be overly conservative.
        (GOACC_TILE): New.
        * internal-fn.c (expand_GOACC_TILE): New.

        * omp-low.c (struct omp_for_data): Add tiling field.
        (struct oacc_loop): Change 'ifns' to vector of call stmts,
        add e_mask field.
	(enum oacc_loop_flags): Add OLF_TILE flag.
	(extract_omp_for_data): Deal with tiling.
	(scan_sharing_clauses): Allow OMP_CLAUSE_TILE.
	(lower_oacc_head_mark): Add OLF_TILE as appropriate.  Ensure 2 levels
	for auto loops.  Remove default auto determining, moved to
	oacc_loop_fixed_partitions.
	(struct oacc_collapse): Add tile and outer fields.
	(expand_oacc_collapse_init): Add LOC parameter.  Initialize tile
	element fields.
	(expand_oacc_collapse_vars): Add INNER parm, adjust for tiling,
	avoid DIV for outermost collapse var.
	(expand_oacc_for): Insert tile element loop as needed.  Adjust.
	Remove out of date comments, fix whitespace.
	(oacc_dim_call): New function, abstracted out from oacc_thread_numbers.
	(oacc_thread_numbers): Use oacc_dim_call.
	(oacc_xform_tile): New.
	(new_oacc_loop_raw): Initialize e_mask, adjust for ifns vector.
	(finish_oacc_loop): Adjust for ifns vector.
	(oacc_loop_discover_walk): Append loop abstraction sites to list,
	add case for GOACC_TILE fns.
	(oacc_loop_xform_loop): Delete.
	(oacc_loop_process): Iterate over call list directly, and add
	handling for GOACC_TILE fns.
	(oacc_loop_fixed_partitions): Determine default auto, deal with TILE,
	dump partitioning.
	(oacc_loop_auto_partitions): Add outer_assign parm. Assign all but
        vector partitioning to outer loops.  Assign 2 partitions to loops
        when available. Add TILE handling.
        (oacc_loop_partition): Adjust oacc_loop_auto_partitions call.
        (execite_oacc_device_lower): Process GOACC_TILE fns, ignore unknown specs.

Comments

Jakub Jelinek Nov. 11, 2016, 10 a.m. UTC | #1
On Thu, Nov 10, 2016 at 06:45:10PM +0800, Chung-Lin Tang wrote:
> 2016-XX-XX  Nathan Sidwell  <nathan@codesourcery.com>
> 
>         * internal-fn.def (GOACC_DIM_POS): Comment may be overly conservative.
>         (GOACC_TILE): New.
>         * internal-fn.c (expand_GOACC_TILE): New.
> 
>         * omp-low.c (struct omp_for_data): Add tiling field.
>         (struct oacc_loop): Change 'ifns' to vector of call stmts,
>         add e_mask field.

Please avoid using 8 spaces instead of a tab in ChangeLog.

> 	dump partitioning.
> 	(oacc_loop_auto_partitions): Add outer_assign parm. Assign all but
>         vector partitioning to outer loops.  Assign 2 partitions to loops
>         when available. Add TILE handling.
>         (oacc_loop_partition): Adjust oacc_loop_auto_partitions call.
>         (execite_oacc_device_lower): Process GOACC_TILE fns, ignore unknown specs.

Here too.  execute instead of execite?  And the last line is too long.

> @@ -626,7 +638,8 @@ extract_omp_for_data (gomp_for *for_stmt, struct o
>    int cnt = fd->ordered ? fd->ordered : fd->collapse;
>    for (i = 0; i < cnt; i++)
>      {
> -      if (i == 0 && fd->collapse == 1 && (fd->ordered == 0 || loops == NULL))
> +      if (i == 0 && fd->collapse == 1 && !fd->tiling
> +	  && (fd->ordered == 0 || loops == NULL))
>  	loop = &fd->loop;
>        else if (loops != NULL)
>  	loop = loops + i;

If the condition fits on one line, it can stay as is; if it can't, then
you should put each && operand on its own line:
	if (i == 0
	    && fd->collapse == 1
	    && !fd->tiling
	    && (fd->ordered == 0 || loops == NULL))
IMHO.

> +	  tree tile = TREE_VALUE (tiling);
> +	  gcall *call = gimple_build_call_internal
> +	    (IFN_GOACC_TILE, 5, num, loop_no, tile,
> +	     /* gwv-outer=*/integer_zero_node,
> +	     /* gwv-inner=*/integer_zero_node);

I don't really like the ( on a separate line unless absolutely necessary.
So better:

	  gcall *call
	    = gimple_build_call_internal (IFN_GOACC_TILE, 5, num, loop_no,
					  tile, integer_zero_node,
					  integer_zero_node);

> +	  call = gimple_build_call_internal
> +	    (IFN_GOACC_LOOP, 7,
> +	     build_int_cst (integer_type_node, IFN_GOACC_LOOP_OFFSET),
> +	     dir, e_range, element_s, chunk, e_gwv, chunk);

Similarly.  For the build_int_cst argument just add a temporary with
a short name (e.g. t) and initialize it to build_int_cst before the
gimple_build_call_internal.

> +	  gimple_call_set_lhs (call, e_offset);
> +	  gimple_set_location (call, loc);
> +	  gsi_insert_before (&gsi, call, GSI_SAME_STMT);
> +
> +	  call = gimple_build_call_internal
> +	    (IFN_GOACC_LOOP, 7,
> +	     build_int_cst (integer_type_node, IFN_GOACC_LOOP_BOUND),
> +	     dir, e_range, element_s, chunk, e_gwv, e_offset);
> +	  gimple_call_set_lhs (call, e_bound);
> +	  gimple_set_location (call, loc);
> +	  gsi_insert_before (&gsi, call, GSI_SAME_STMT);
> +
> +	  call = gimple_build_call_internal
> +	    (IFN_GOACC_LOOP, 6,
> +	     build_int_cst (integer_type_node, IFN_GOACC_LOOP_STEP),
> +	     dir, e_range, element_s, chunk, e_gwv);

And again 2x.

>    if (cont_bb)
>      {
> -      /* We now have one or two nested loops.  Update the loop
> +      /* We now have one,  two or three nested loops.  Update the loop

Only one space after , - we use 2 spaces only after full stop.

> @@ -11537,6 +11712,15 @@ expand_oacc_for (struct omp_region *region, struct
>  	  body_loop->header = body_bb;
>  	  body_loop->latch = cont_bb;
>  	  add_loop (body_loop, parent);
> +
> +	  if (fd->tiling)
> +	    {
> +	      // Insert tiling's element loop

Please use /* */ style comment instead, plus full stop:
	      /* Insert tiling's element loop.  */
> +	      struct loop *inner_loop = alloc_loop ();
> +	      inner_loop->header = elem_body_bb;
> +	      inner_loop->latch = elem_cont_bb;
> +	      add_loop (inner_loop, body_loop);

> +static void
> +oacc_xform_tile (gcall *call)
> +{
> +  gimple_stmt_iterator gsi = gsi_for_stmt (call);
> +  unsigned collapse = (unsigned) TREE_INT_CST_LOW (gimple_call_arg (call, 0));
> +  /* Inner loops have higher loop_nos.  */
> +  unsigned loop_no = (unsigned) TREE_INT_CST_LOW (gimple_call_arg (call, 1));
> +  tree tile_size = gimple_call_arg (call, 2);
> +  unsigned e_mask = (unsigned) TREE_INT_CST_LOW (gimple_call_arg (call, 4));

Please use
  unsigned collapse = tree_to_uhwi (gimple_call_arg (call, 0));
etc. instead.

> +  tree lhs = gimple_call_lhs (call);
> +  tree type = TREE_TYPE (lhs);
> +  gimple_seq seq = NULL;
> +  tree span = build_int_cst (type, 1);
> +
> +  gcc_assert (!(e_mask
> +		& ~(GOMP_DIM_MASK (GOMP_DIM_VECTOR)
> +		    | GOMP_DIM_MASK (GOMP_DIM_WORKER))));
> +  push_gimplify_context (!seen_error ());
> +  if (
> +#ifndef ACCEL_COMPILER
> +      1 ||
> +#endif
> +      !e_mask)

We don't want || at the end of line.
Perhaps better
#ifndef ACCEL_COMPILER
  e_mask = 0;
#endif
  if (!e_mask)
?

> +	switch (gimple_call_internal_fn (call))
> +	  {
> +	  case IFN_GOACC_LOOP:
> +	    {
> +	      bool is_e = gimple_call_arg (call, 5) == integer_minus_one_node;
> +	      *gimple_call_arg_ptr (call, 5) = is_e ? e_mask_arg : mask_arg;
> +	      if (!is_e)
> +		*gimple_call_arg_ptr (call, 4) = chunk_arg;
> +	    }
> +	    break;
>  
> +	  case IFN_GOACC_TILE:
> +	    *gimple_call_arg_ptr (call, 3) = mask_arg;
> +	    *gimple_call_arg_ptr (call, 4) = e_mask_arg;
> +	    break;

Please use
gimple_call_set_arg (call, idx, val)
instead of
*gimple_call_arg_ptr (call, idx) = val

> +      /* Apply auto partitioning if this is a non-partitioned regular
> +	 loop, or (no more than) single axis tiled loop.  */
> +      bool maybe_auto = !seq_par
> +	&& this_mask == (tiling ? this_mask & -this_mask : 0);

Wrong formatting: the && must go below !seq_par, or better, there should be ()
around the whole rhs.  Or maybe even better:
      bool maybe_auto
	= !seq_par && this_mask == (tiling ? this_mask & -this_mask : 0);

>    if (loop->child)
>      {
> -      loop->inner = oacc_loop_fixed_partitions (loop->child,
> -						outer_mask | this_mask); 
> +      loop->inner = oacc_loop_fixed_partitions
> +	(loop->child, outer_mask | this_mask | loop->e_mask); 

See above for ( on next line.  Perhaps introduce a temporary holding
outer_mask | this_mask | loop->e_mask
or similar.

> -  if (assign && !loop->mask)
> +  if (loop->child)
> +    loop->inner = oacc_loop_auto_partitions
> +      (loop->child, outer_mask | loop->mask | loop->e_mask,
> +       outer_assign | assign);

Again.

>        /* Determine the outermost partitioning used within this loop. */
>        this_mask = loop->inner | GOMP_DIM_MASK (GOMP_DIM_MAX);
>        this_mask = least_bit_hwi (this_mask);
>  
>        /* Pick the partitioning just inside that one.  */
>        this_mask >>= 1;
> -
>        /* And avoid picking one use by an outer loop. */
>        this_mask &= ~outer_mask;

Why remove the empty line?

	Jakub
diff mbox

Patch

Index: internal-fn.c
===================================================================
--- internal-fn.c	(revision 241809)
+++ internal-fn.c	(working copy)
@@ -2168,6 +2168,14 @@  expand_GOACC_REDUCTION (internal_fn, gcall *)
   gcc_unreachable ();
 }
 
+/* This is expanded by oacc_device_lower pass.  */
+
+static void
+expand_GOACC_TILE (internal_fn, gcall *)
+{
+  gcc_unreachable ();
+}
+
 /* Set errno to EDOM.  */
 
 static void
Index: internal-fn.def
===================================================================
--- internal-fn.def	(revision 241809)
+++ internal-fn.def	(working copy)
@@ -175,7 +175,7 @@  DEF_INTERNAL_FN (UNIQUE, ECF_NOTHROW, NULL)
    dimension.  DIM_POS is pure (and not const) so that it isn't
    thought to clobber memory and can be gcse'd within a single
    parallel region, but not across FORK/JOIN boundaries.  They take a
-   single INTEGER_CST argument.  */
+   single INTEGER_CST argument.  This might be overly conservative.  */
 DEF_INTERNAL_FN (GOACC_DIM_SIZE, ECF_CONST | ECF_NOTHROW | ECF_LEAF, ".")
 DEF_INTERNAL_FN (GOACC_DIM_POS, ECF_PURE | ECF_NOTHROW | ECF_LEAF, ".")
 
@@ -185,6 +185,10 @@  DEF_INTERNAL_FN (GOACC_LOOP, ECF_PURE | ECF_NOTHRO
 /* OpenACC reduction abstraction.  See internal-fn.h  for usage.  */
 DEF_INTERNAL_FN (GOACC_REDUCTION, ECF_NOTHROW | ECF_LEAF, NULL)
 
+/* Openacc tile abstraction. Describes the spans of the element loop.
+   GOACC_TILE (num-loops, loop-no, tile-arg, tile-mask, element-mask).  */
+DEF_INTERNAL_FN (GOACC_TILE, ECF_NOTHROW | ECF_LEAF, NULL)
+
 /* Set errno to EDOM, if GCC knows how to do that directly for the
    current target.  */
 DEF_INTERNAL_FN (SET_EDOM, ECF_LEAF | ECF_NOTHROW, NULL)
Index: omp-low.c
===================================================================
--- omp-low.c	(revision 241809)
+++ omp-low.c	(working copy)
@@ -213,7 +213,8 @@  struct omp_for_data
   tree chunk_size;
   gomp_for *for_stmt;
   tree pre, iter_type;
-  int collapse;
+  tree tiling;  /* Tiling values (if non null).  */
+  int collapse;  /* Collapsed loops, 1 for a non-collapsed loop.  */
   int ordered;
   bool have_nowait, have_ordered, simd_schedule;
   unsigned char sched_modifiers;
@@ -242,9 +243,10 @@  struct oacc_loop
   tree routine;  /* Pseudo-loop enclosing a routine.  */
 
   unsigned mask;   /* Partitioning mask.  */
+  unsigned e_mask; /* Partitioning of element loops (when tiling).  */
   unsigned inner;  /* Partitioning of inner loops.  */
   unsigned flags;  /* Partitioning flags.  */
-  unsigned ifns;   /* Contained loop abstraction functions.  */
+  vec<gcall *> ifns;  /* Contained loop abstraction functions.  */
   tree chunk_size; /* Chunk size.  */
   gcall *head_end; /* Final marker of head sequence.  */
 };
@@ -256,9 +258,10 @@  enum oacc_loop_flags {
   OLF_AUTO	= 1u << 1,	/* Compiler chooses axes.  */
   OLF_INDEPENDENT = 1u << 2,	/* Iterations are known independent.  */
   OLF_GANG_STATIC = 1u << 3,	/* Gang partitioning is static (has op). */
-
+  OLF_TILE	= 1u << 4,	/* Tiled loop. */
+  
   /* Explicitly specified loop axes.  */
-  OLF_DIM_BASE = 4,
+  OLF_DIM_BASE = 5,
   OLF_DIM_GANG   = 1u << (OLF_DIM_BASE + GOMP_DIM_GANG),
   OLF_DIM_WORKER = 1u << (OLF_DIM_BASE + GOMP_DIM_WORKER),
   OLF_DIM_VECTOR = 1u << (OLF_DIM_BASE + GOMP_DIM_VECTOR),
@@ -536,13 +539,9 @@  extract_omp_for_data (gomp_for *for_stmt, struct o
 
   fd->for_stmt = for_stmt;
   fd->pre = NULL;
-  if (gimple_omp_for_collapse (for_stmt) > 1)
-    fd->loops = loops;
-  else
-    fd->loops = &fd->loop;
-
   fd->have_nowait = distribute || simd;
   fd->have_ordered = false;
+  fd->tiling = NULL;
   fd->collapse = 1;
   fd->ordered = 0;
   fd->sched_kind = OMP_CLAUSE_SCHEDULE_STATIC;
@@ -587,9 +586,22 @@  extract_omp_for_data (gomp_for *for_stmt, struct o
 	    collapse_count = &OMP_CLAUSE_COLLAPSE_COUNT (t);
 	  }
 	break;
+      case OMP_CLAUSE_TILE:
+	fd->tiling = OMP_CLAUSE_TILE_LIST (t);
+	fd->collapse = list_length (fd->tiling);
+	gcc_assert (fd->collapse);
+	collapse_iter = &OMP_CLAUSE_TILE_ITERVAR (t);
+	collapse_count = &OMP_CLAUSE_TILE_COUNT (t);
+	break;
       default:
 	break;
       }
+
+  if (fd->collapse > 1 || fd->tiling)
+    fd->loops = loops;
+  else
+    fd->loops = &fd->loop;
+
   if (fd->ordered && fd->collapse == 1 && loops != NULL)
     {
       fd->loops = loops;
@@ -608,7 +620,7 @@  extract_omp_for_data (gomp_for *for_stmt, struct o
       fd->sched_kind = OMP_CLAUSE_SCHEDULE_STATIC;
       gcc_assert (fd->chunk_size == NULL);
     }
-  gcc_assert (fd->collapse == 1 || collapse_iter != NULL);
+  gcc_assert ((fd->collapse == 1 && !fd->tiling) || collapse_iter != NULL);
   if (taskloop)
     fd->sched_kind = OMP_CLAUSE_SCHEDULE_RUNTIME;
   if (fd->sched_kind == OMP_CLAUSE_SCHEDULE_RUNTIME)
@@ -626,7 +638,8 @@  extract_omp_for_data (gomp_for *for_stmt, struct o
   int cnt = fd->ordered ? fd->ordered : fd->collapse;
   for (i = 0; i < cnt; i++)
     {
-      if (i == 0 && fd->collapse == 1 && (fd->ordered == 0 || loops == NULL))
+      if (i == 0 && fd->collapse == 1 && !fd->tiling
+	  && (fd->ordered == 0 || loops == NULL))
 	loop = &fd->loop;
       else if (loops != NULL)
 	loop = loops + i;
@@ -655,7 +668,7 @@  extract_omp_for_data (gomp_for *for_stmt, struct o
 	  || (fd->sched_kind == OMP_CLAUSE_SCHEDULE_STATIC
 	      && !fd->have_ordered))
 	{
-	  if (fd->collapse == 1)
+	  if (fd->collapse == 1 && !fd->tiling)
 	    iter_type = TREE_TYPE (loop->v);
 	  else if (i == 0
 		   || TYPE_PRECISION (iter_type)
@@ -786,7 +799,7 @@  extract_omp_for_data (gomp_for *for_stmt, struct o
 	*collapse_count = create_tmp_var (iter_type, ".count");
     }
 
-  if (fd->collapse > 1 || (fd->ordered && loops))
+  if (fd->collapse > 1 || fd->tiling || (fd->ordered && loops))
     {
       fd->loop.v = *collapse_iter;
       fd->loop.n1 = build_int_cst (TREE_TYPE (fd->loop.v), 0);
@@ -2192,6 +2205,7 @@  scan_sharing_clauses (tree clauses, omp_context *c
 	case OMP_CLAUSE_INDEPENDENT:
 	case OMP_CLAUSE_AUTO:
 	case OMP_CLAUSE_SEQ:
+	case OMP_CLAUSE_TILE:
 	  break;
 
 	case OMP_CLAUSE_ALIGNED:
@@ -2201,7 +2215,6 @@  scan_sharing_clauses (tree clauses, omp_context *c
 	    install_var_local (decl, ctx);
 	  break;
 
-	case OMP_CLAUSE_TILE:
 	case OMP_CLAUSE__CACHE_:
 	default:
 	  gcc_unreachable ();
@@ -2362,10 +2375,10 @@  scan_sharing_clauses (tree clauses, omp_context *c
 	case OMP_CLAUSE_INDEPENDENT:
 	case OMP_CLAUSE_AUTO:
 	case OMP_CLAUSE_SEQ:
+	case OMP_CLAUSE_TILE:
 	case OMP_CLAUSE__GRIDDIM_:
 	  break;
 
-	case OMP_CLAUSE_TILE:
 	case OMP_CLAUSE__CACHE_:
 	default:
 	  gcc_unreachable ();
@@ -6349,6 +6362,10 @@  lower_oacc_head_mark (location_t loc, tree ddvar,
 	  tag |= OLF_INDEPENDENT;
 	  break;
 
+	case OMP_CLAUSE_TILE:
+	  tag |= OLF_TILE;
+	  break;
+
 	default:
 	  continue;
 	}
@@ -6366,14 +6383,20 @@  lower_oacc_head_mark (location_t loc, tree ddvar,
   if (!tgt || is_oacc_parallel (tgt))
     tag |= OLF_INDEPENDENT;
 
-  /* A loop lacking SEQ, GANG, WORKER and/or VECTOR is implicitly AUTO.  */
-  if (!(tag & (((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1) << OLF_DIM_BASE)
-	       | OLF_SEQ)))
-      tag |= OLF_AUTO;
+  if (tag & OLF_TILE)
+    /* Tiling could use all 3 levels.  */ 
+    levels = 3;
+  else
+    {
+      /* A loop lacking SEQ, GANG, WORKER and/or VECTOR could be AUTO.
+	 Ensure at least one level, or 2 for possible auto
+	 partitioning */
+      bool maybe_auto = !(tag & (((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1)
+				  << OLF_DIM_BASE) | OLF_SEQ));
 
-  /* Ensure at least one level.  */
-  if (!levels)
-    levels++;
+      if (levels < 1u + maybe_auto)
+	levels = 1u + maybe_auto;
+    }
 
   args.quick_push (build_int_cst (integer_type_node, levels));
   args.quick_push (build_int_cst (integer_type_node, tag));
@@ -7445,7 +7468,9 @@  struct oacc_collapse
 {
   tree base;  /* Base value. */
   tree iters; /* Number of steps.  */
-  tree step;  /* step size.  */
+  tree step;  /* Step size.  */
+  tree tile;  /* Tile increment (if tiled).  */
+  tree outer; /* Tile iterator var. */
 };
 
 /* Helper for expand_oacc_for.  Determine collapsed loop information.
@@ -7455,15 +7480,20 @@  struct oacc_collapse
 static tree
 expand_oacc_collapse_init (const struct omp_for_data *fd,
 			   gimple_stmt_iterator *gsi,
-			   oacc_collapse *counts, tree bound_type)
+			   oacc_collapse *counts, tree bound_type,
+			   location_t loc)
 {
+  tree tiling = fd->tiling;
   tree total = build_int_cst (bound_type, 1);
   int ix;
   
   gcc_assert (integer_onep (fd->loop.step));
   gcc_assert (integer_zerop (fd->loop.n1));
 
-  for (ix = 0; ix != fd->collapse; ix++)
+  /* When tiling, the first operand of the tile clause applies to the
+     innermost loop, and we work outwards from there.  Seems
+     backwards, but whatever.  */
+  for (ix = fd->collapse; ix--;)
     {
       const omp_for_data_loop *loop = &fd->loops[ix];
 
@@ -7478,6 +7508,30 @@  expand_oacc_collapse_init (const struct omp_for_da
       if (POINTER_TYPE_P (diff_type) || TYPE_UNSIGNED (diff_type))
 	diff_type = signed_type_for (diff_type);
 
+      if (tiling)
+	{
+	  tree num = build_int_cst (integer_type_node, fd->collapse);
+	  tree loop_no = build_int_cst (integer_type_node, ix);
+	  tree tile = TREE_VALUE (tiling);
+	  gcall *call = gimple_build_call_internal
+	    (IFN_GOACC_TILE, 5, num, loop_no, tile,
+	     /* gwv-outer=*/integer_zero_node,
+	     /* gwv-inner=*/integer_zero_node);
+
+	  counts[ix].outer = create_tmp_var (iter_type, ".outer");
+	  counts[ix].tile = create_tmp_var (diff_type, ".tile");
+	  gimple_call_set_lhs (call, counts[ix].tile);
+	  gimple_set_location (call, loc);
+	  gsi_insert_before (gsi, call, GSI_SAME_STMT);
+
+	  tiling = TREE_CHAIN (tiling);
+	}
+      else
+	{
+	  counts[ix].tile = NULL;
+	  counts[ix].outer = loop->v;
+	}
+
       tree b = loop->n1;
       tree e = loop->n2;
       tree s = loop->step;
@@ -7531,13 +7585,14 @@  expand_oacc_collapse_init (const struct omp_for_da
   return total;
 }
 
-/* Emit initializers for collapsed loop members.  IVAR is the outer
+/* Emit initializers for collapsed loop members.  INNER is true if
+   this is for the element loop of a TILE.  IVAR is the outer
    loop iteration variable, from which collapsed loop iteration values
    are  calculated.  COUNTS array has been initialized by
    expand_oacc_collapse_inits.  */
 
 static void
-expand_oacc_collapse_vars (const struct omp_for_data *fd,
+expand_oacc_collapse_vars (const struct omp_for_data *fd, bool inner,
 			   gimple_stmt_iterator *gsi,
 			   const oacc_collapse *counts, tree ivar)
 {
@@ -7549,7 +7604,8 @@  static void
     {
       const omp_for_data_loop *loop = &fd->loops[ix];
       const oacc_collapse *collapse = &counts[ix];
-      tree iter_type = TREE_TYPE (loop->v);
+      tree v = inner ? loop->v : collapse->outer;
+      tree iter_type = TREE_TYPE (v);
       tree diff_type = TREE_TYPE (collapse->step);
       tree plus_type = iter_type;
       enum tree_code plus_code = PLUS_EXPR;
@@ -7561,24 +7617,25 @@  static void
 	  plus_type = sizetype;
 	}
 
-      expr = fold_build2 (TRUNC_MOD_EXPR, ivar_type, ivar,
-			  fold_convert (ivar_type, collapse->iters));
+      expr = ivar;
+      if (ix)
+	{
+	  tree mod = fold_convert (ivar_type, collapse->iters);
+	  ivar = fold_build2 (TRUNC_DIV_EXPR, ivar_type, expr, mod);
+	  expr = fold_build2 (TRUNC_MOD_EXPR, ivar_type, expr, mod);
+	  ivar = force_gimple_operand_gsi (gsi, ivar, true, NULL_TREE,
+					   true, GSI_SAME_STMT);
+	}
+
       expr = fold_build2 (MULT_EXPR, diff_type, fold_convert (diff_type, expr),
 			  collapse->step);
-      expr = fold_build2 (plus_code, iter_type, collapse->base,
+      expr = fold_build2 (plus_code, iter_type,
+			  inner ? collapse->outer : collapse->base,
 			  fold_convert (plus_type, expr));
       expr = force_gimple_operand_gsi (gsi, expr, false, NULL_TREE,
 				       true, GSI_SAME_STMT);
-      gassign *ass = gimple_build_assign (loop->v, expr);
+      gassign *ass = gimple_build_assign (v, expr);
       gsi_insert_before (gsi, ass, GSI_SAME_STMT);
-
-      if (ix)
-	{
-	  expr = fold_build2 (TRUNC_DIV_EXPR, ivar_type, ivar,
-			      fold_convert (ivar_type, collapse->iters));
-	  ivar = force_gimple_operand_gsi (gsi, expr, true, NULL_TREE,
-					   true, GSI_SAME_STMT);
-	}
     }
 }
 
@@ -11148,7 +11205,8 @@  expand_omp_taskloop_for_inner (struct omp_region *
    where LTGT is < or >.  We may have a specified chunking size, CHUNKING
    (constant 0 for no chunking) and we will have a GWV partitioning
    mask, specifying dimensions over which the loop is to be
-   partitioned (see note below).  We generate code that looks like:
+   partitioned (see note below).  We generate code that looks like
+   (this ignores tiling):
 
    <entry_bb> [incoming FALL->body, BRANCH->exit]
      typedef signedintify (typeof (V)) T;  // underlying signed integral type
@@ -11178,12 +11236,8 @@  expand_omp_taskloop_for_inner (struct omp_region *
    <exit_bb> [incoming]
      V = B + ((range -/+ 1) / S +/- 1) * S [*]
 
-   [*] Needed if V live at end of loop
+   [*] Needed if V live at end of loop.  */
 
-   Note: CHUNKING & GWV mask are specified explicitly here.  This is a
-   transition, and will be specified by a more general mechanism shortly.
- */
-
 static void
 expand_oacc_for (struct omp_region *region, struct omp_for_data *fd)
 {
@@ -11245,9 +11299,16 @@  expand_oacc_for (struct omp_region *region, struct
   tree step = create_tmp_var (diff_type, ".step");
   bool up = cond_code == LT_EXPR;
   tree dir = build_int_cst (diff_type, up ? +1 : -1);
-  bool chunking = !gimple_in_ssa_p (cfun);;
+  bool chunking = !gimple_in_ssa_p (cfun);
   bool negating;
 
+  /* Tiling vars.  */
+  tree tile_size = NULL_TREE;
+  tree element_s = NULL_TREE;
+  tree e_bound = NULL_TREE, e_offset = NULL_TREE, e_step = NULL_TREE;
+  basic_block elem_body_bb = NULL;
+  basic_block elem_cont_bb = NULL;
+
   /* SSA instances.  */
   tree offset_incr = NULL_TREE;
   tree offset_init = NULL_TREE;
@@ -11278,11 +11339,12 @@  expand_oacc_for (struct omp_region *region, struct
       gwv = build_int_cst (integer_type_node, GOMP_DIM_MASK (GOMP_DIM_GANG));
     }
 
-  if (fd->collapse > 1)
+  if (fd->collapse > 1 || fd->tiling)
     {
+      gcc_assert (!gimple_in_ssa_p (cfun) && up);
       counts = XALLOCAVEC (struct oacc_collapse, fd->collapse);
       tree total = expand_oacc_collapse_init (fd, &gsi, counts,
-					      TREE_TYPE (fd->loop.n2));
+					      TREE_TYPE (fd->loop.n2), loc);
 
       if (SSA_VAR_P (fd->loop.n2))
 	{
@@ -11291,7 +11353,6 @@  expand_oacc_for (struct omp_region *region, struct
 	  ass = gimple_build_assign (fd->loop.n2, total);
 	  gsi_insert_before (&gsi, ass, GSI_SAME_STMT);
 	}
-      
     }
 
   tree b = fd->loop.n1;
@@ -11315,6 +11376,29 @@  expand_oacc_for (struct omp_region *region, struct
   expr = fold_convert (diff_type, chunk_size);
   chunk_size = force_gimple_operand_gsi (&gsi, expr, true,
 					 NULL_TREE, true, GSI_SAME_STMT);
+
+  if (fd->tiling)
+    {
+      /* Determine the tile size and element step,
+	 modify the outer loop step size.  */
+      tile_size = create_tmp_var (diff_type, ".tile_size");
+      expr = build_int_cst (diff_type, 1);
+      for (int ix = 0; ix < fd->collapse; ix++)
+	expr = fold_build2 (MULT_EXPR, diff_type, counts[ix].tile, expr);
+      expr = force_gimple_operand_gsi (&gsi, expr, true,
+				       NULL_TREE, true, GSI_SAME_STMT);
+      ass = gimple_build_assign (tile_size, expr);
+      gsi_insert_before (&gsi, ass, GSI_SAME_STMT);
+
+      element_s = create_tmp_var (diff_type, ".element_s");
+      ass = gimple_build_assign (element_s, s);
+      gsi_insert_before (&gsi, ass, GSI_SAME_STMT);
+
+      expr = fold_build2 (MULT_EXPR, diff_type, s, tile_size);
+      s = force_gimple_operand_gsi (&gsi, expr, true,
+				    NULL_TREE, true, GSI_SAME_STMT);
+    }
+
   /* Determine the range, avoiding possible unsigned->signed overflow. */
   negating = !up && TYPE_UNSIGNED (iter_type);
   expr = fold_build2 (MINUS_EXPR, plus_type,
@@ -11419,8 +11503,75 @@  expand_oacc_for (struct omp_region *region, struct
 				       true, GSI_SAME_STMT);
       ass = gimple_build_assign (v, expr);
       gsi_insert_before (&gsi, ass, GSI_SAME_STMT);
-      if (fd->collapse > 1)
-	expand_oacc_collapse_vars (fd, &gsi, counts, v);
+
+      if (fd->collapse > 1 || fd->tiling)
+	expand_oacc_collapse_vars (fd, false, &gsi, counts, v);
+
+      if (fd->tiling)
+	{
+	  /* Determine the range of the element loop -- usually simply
+	     the tile_size, but could be smaller if the final
+	     iteration of the outer loop is a partial tile.  */
+	  tree e_range = create_tmp_var (diff_type, ".e_range");
+
+	  expr = build2 (MIN_EXPR, diff_type,
+			 build2 (MINUS_EXPR, diff_type, bound, offset),
+			 build2 (MULT_EXPR, diff_type, tile_size,
+				  element_s));
+	  expr = force_gimple_operand_gsi (&gsi, expr, false, NULL_TREE,
+					   true, GSI_SAME_STMT);
+	  ass = gimple_build_assign (e_range, expr);
+	  gsi_insert_before (&gsi, ass, GSI_SAME_STMT);
+
+	  /* Determine bound, offset & step of inner loop. */
+	  e_bound = create_tmp_var (diff_type, ".e_bound");
+	  e_offset = create_tmp_var (diff_type, ".e_offset");
+	  e_step = create_tmp_var (diff_type, ".e_step");
+
+	  /* Mark these as element loops.  */
+	  tree e_gwv = integer_minus_one_node;
+	  tree chunk = build_int_cst (diff_type, 0); /* Never chunked.  */
+
+	  call = gimple_build_call_internal
+	    (IFN_GOACC_LOOP, 7,
+	     build_int_cst (integer_type_node, IFN_GOACC_LOOP_OFFSET),
+	     dir, e_range, element_s, chunk, e_gwv, chunk);
+	  gimple_call_set_lhs (call, e_offset);
+	  gimple_set_location (call, loc);
+	  gsi_insert_before (&gsi, call, GSI_SAME_STMT);
+
+	  call = gimple_build_call_internal
+	    (IFN_GOACC_LOOP, 7,
+	     build_int_cst (integer_type_node, IFN_GOACC_LOOP_BOUND),
+	     dir, e_range, element_s, chunk, e_gwv, e_offset);
+	  gimple_call_set_lhs (call, e_bound);
+	  gimple_set_location (call, loc);
+	  gsi_insert_before (&gsi, call, GSI_SAME_STMT);
+
+	  call = gimple_build_call_internal
+	    (IFN_GOACC_LOOP, 6,
+	     build_int_cst (integer_type_node, IFN_GOACC_LOOP_STEP),
+	     dir, e_range, element_s, chunk, e_gwv);
+	  gimple_call_set_lhs (call, e_step);
+	  gimple_set_location (call, loc);
+	  gsi_insert_before (&gsi, call, GSI_SAME_STMT);
+
+	  /* Add test and split block.  */
+	  expr = build2 (cond_code, boolean_type_node, e_offset, e_bound);
+	  stmt = gimple_build_cond_empty (expr);
+	  gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
+	  split = split_block (body_bb, stmt);
+	  elem_body_bb = split->dest;
+	  if (cont_bb == body_bb)
+	    cont_bb = elem_body_bb;
+	  body_bb = split->src;
+
+	  split->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
+
+	  /* Initialize the user's loop vars.  */
+	  gsi = gsi_start_bb (elem_body_bb);
+	  expand_oacc_collapse_vars (fd, true, &gsi, counts, e_offset);
+	}
     }
 
   /* Loop increment goes into cont_bb.  If this is not a loop, we
@@ -11434,9 +11585,33 @@  expand_oacc_for (struct omp_region *region, struct
       gomp_continue *cont_stmt = as_a <gomp_continue *> (gsi_stmt (gsi));
       loc = gimple_location (cont_stmt);
 
+      if (fd->tiling)
+	{
+	  /* Insert element loop increment and test.  */
+	  expr = build2 (PLUS_EXPR, diff_type, e_offset, e_step);
+	  expr = force_gimple_operand_gsi (&gsi, expr, false, NULL_TREE,
+					   true, GSI_SAME_STMT);
+	  ass = gimple_build_assign (e_offset, expr);
+	  gsi_insert_before (&gsi, ass, GSI_SAME_STMT);
+	  expr = build2 (cond_code, boolean_type_node, e_offset, e_bound);
+
+	  stmt = gimple_build_cond_empty (expr);
+	  gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
+	  split = split_block (cont_bb, stmt);
+	  elem_cont_bb = split->src;
+	  cont_bb = split->dest;
+
+	  split->flags ^= EDGE_FALLTHRU | EDGE_FALSE_VALUE;
+	  make_edge (elem_cont_bb, elem_body_bb, EDGE_TRUE_VALUE);
+
+	  make_edge (body_bb, cont_bb, EDGE_FALSE_VALUE);
+
+	  gsi = gsi_for_stmt (cont_stmt);
+	}
+
       /* Increment offset.  */
       if (gimple_in_ssa_p (cfun))
-	expr= build2 (plus_code, iter_type, offset,
+	expr = build2 (plus_code, iter_type, offset,
 		      fold_convert (plus_type, step));
       else
 	expr = build2 (PLUS_EXPR, diff_type, offset, step);
@@ -11510,7 +11685,7 @@  expand_oacc_for (struct omp_region *region, struct
 
   if (cont_bb)
     {
-      /* We now have one or two nested loops.  Update the loop
+      /* We now have one,  two or three nested loops.  Update the loop
 	 structures.  */
       struct loop *parent = entry_bb->loop_father;
       struct loop *body = body_bb->loop_father;
@@ -11537,6 +11712,15 @@  expand_oacc_for (struct omp_region *region, struct
 	  body_loop->header = body_bb;
 	  body_loop->latch = cont_bb;
 	  add_loop (body_loop, parent);
+
+	  if (fd->tiling)
+	    {
+	      // Insert tiling's element loop
+	      struct loop *inner_loop = alloc_loop ();
+	      inner_loop->header = elem_body_bb;
+	      inner_loop->latch = elem_cont_bb;
+	      add_loop (inner_loop, body_loop);
+	    }
 	}
     }
 }
@@ -18555,6 +18739,23 @@  omp_finish_file (void)
     }
 }
 
+/* Call dim_pos (POS == true) or dim_size (POS == false) builtins for
+   axis DIM.  Return a tmp var holding the result.  */
+
+static tree
+oacc_dim_call (bool pos, int dim, gimple_seq *seq)
+{
+  tree arg = build_int_cst (unsigned_type_node, dim);
+  tree size = create_tmp_var (integer_type_node);
+  enum internal_fn fn = pos ? IFN_GOACC_DIM_POS : IFN_GOACC_DIM_SIZE;
+  gimple *call = gimple_build_call_internal (fn, 1, arg);
+
+  gimple_call_set_lhs (call, size);
+  gimple_seq_add_stmt (seq, call);
+
+  return size;
+}
+
 /* Find the number of threads (POS = false), or thread number (POS =
    true) for an OpenACC region partitioned as MASK.  Setup code
    required for the calculation is added to SEQ.  */
@@ -18569,29 +18770,17 @@  oacc_thread_numbers (bool pos, int mask, gimple_se
   for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
     if (GOMP_DIM_MASK (ix) & mask)
       {
-	tree arg = build_int_cst (unsigned_type_node, ix);
-
 	if (res)
 	  {
 	    /* We had an outer index, so scale that by the size of
 	       this dimension.  */
-	    tree n = create_tmp_var (integer_type_node);
-	    gimple *call
-	      = gimple_build_call_internal (IFN_GOACC_DIM_SIZE, 1, arg);
-	    
-	    gimple_call_set_lhs (call, n);
-	    gimple_seq_add_stmt (seq, call);
+	    tree n = oacc_dim_call (false, ix, seq);
 	    res = fold_build2 (MULT_EXPR, integer_type_node, res, n);
 	  }
 	if (pos)
 	  {
 	    /* Determine index in this dimension.  */
-	    tree id = create_tmp_var (integer_type_node);
-	    gimple *call = gimple_build_call_internal
-	      (IFN_GOACC_DIM_POS, 1, arg);
-
-	    gimple_call_set_lhs (call, id);
-	    gimple_seq_add_stmt (seq, call);
+	    tree id = oacc_dim_call (true, ix, seq);
 	    if (res)
 	      res = fold_build2 (PLUS_EXPR, integer_type_node, res, id);
 	    else
@@ -18790,6 +18979,84 @@  oacc_xform_loop (gcall *call)
   gsi_replace_with_seq (&gsi, seq, true);
 }
 
+/* Transform a GOACC_TILE call.  Determines the element loop span for
+   the specified loop of the nest.  This is 1 if we're not tiling.
+
+   GOACC_TILE (collapse_count, loop_no, tile_arg, gwv_tile, gwv_element);  */
+
+static void
+oacc_xform_tile (gcall *call)
+{
+  gimple_stmt_iterator gsi = gsi_for_stmt (call);
+  unsigned collapse = (unsigned) TREE_INT_CST_LOW (gimple_call_arg (call, 0));
+  /* Inner loops have higher loop_nos.  */
+  unsigned loop_no = (unsigned) TREE_INT_CST_LOW (gimple_call_arg (call, 1));
+  tree tile_size = gimple_call_arg (call, 2);
+  unsigned e_mask = (unsigned) TREE_INT_CST_LOW (gimple_call_arg (call, 4));
+  tree lhs = gimple_call_lhs (call);
+  tree type = TREE_TYPE (lhs);  /* NOTE(review): assumes CALL has an lhs.  */
+  gimple_seq seq = NULL;
+  tree span = build_int_cst (type, 1);  /* Reassigned on every path.  */
+
+  gcc_assert (!(e_mask
+		& ~(GOMP_DIM_MASK (GOMP_DIM_VECTOR)
+		    | GOMP_DIM_MASK (GOMP_DIM_WORKER))));
+  push_gimplify_context (!seen_error ());
+  if (
+#ifndef ACCEL_COMPILER
+      1 ||
+#endif
+      !e_mask)
+    /* Not partitioning.  */
+    span = integer_one_node;
+  else if (!integer_zerop (tile_size))
+    /* User explicitly specified size.  */
+    span = tile_size;
+  else
+    {
+      /* Pick a size based on the partitioning of the element loop and
+	 the number of loop nests.  */
+      tree first_size = NULL_TREE;
+      tree second_size = NULL_TREE;
+
+      if (e_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
+	first_size = oacc_dim_call (false, GOMP_DIM_VECTOR, &seq);
+      if (e_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
+	second_size = oacc_dim_call (false, GOMP_DIM_WORKER, &seq);
+
+      if (!first_size)
+	{
+	  first_size = second_size;
+	  second_size = NULL_TREE;
+	}
+
+      if (loop_no + 1 == collapse)
+	{
+	  span = first_size;
+	  if (!loop_no && second_size)
+	    span = fold_build2 (MULT_EXPR, TREE_TYPE (span),
+				span, second_size);
+	}
+      else if (loop_no + 2 == collapse)
+	span = second_size;
+      else
+	span = NULL_TREE;
+
+      if (!span)
+	/* There's no obvious element size for this loop.  Options
+	   are 1, first_size or some non-unity constant (32 is my
+	   favourite).   We should gather some statistics.  */
+	span = first_size;
+    }
+
+  span = fold_convert (type, span);
+  gimplify_assign (lhs, span, &seq);
+
+  pop_gimplify_context (NULL);
+
+  gsi_replace_with_seq (&gsi, seq, true);
+}
+
 /* Default partitioned and minimum partitioned dimensions.  */
 
 static int oacc_default_dims[GOMP_DIM_MAX];
@@ -18948,8 +19215,7 @@  new_oacc_loop_raw (oacc_loop *parent, location_t l
   memset (loop->tails, 0, sizeof (loop->tails));
   loop->routine = NULL_TREE;
 
-  loop->mask = loop->flags = loop->inner = 0;
-  loop->ifns = 0;
+  loop->mask = loop->e_mask = loop->flags = loop->inner = 0;
   loop->chunk_size = 0;
   loop->head_end = NULL;
 
@@ -19012,7 +19278,7 @@  static oacc_loop *
 finish_oacc_loop (oacc_loop *loop)
 {
   /* If the loop has been collapsed, don't partition it.  */
-  if (!loop->ifns)
+  if (loop->ifns.is_empty ())
     loop->mask = loop->flags = 0;
   return loop->parent;
 }
@@ -19148,9 +19414,10 @@  oacc_loop_discover_walk (oacc_loop *loop, basic_bl
 	  break;
 
 	case IFN_GOACC_LOOP:
-	  /* Count the goacc loop abstraction fns, to determine if the
-	     loop was collapsed already.  */
-	  loop->ifns++;
+	case IFN_GOACC_TILE:
+	  /* Record the abstraction function, so we can manipulate it
+	     later.  */
+	  loop->ifns.safe_push (call);
 	  break;
 
 	case IFN_UNIQUE:
@@ -19285,51 +19552,6 @@  oacc_loop_xform_head_tail (gcall *from, int level)
     }
 }
 
-/* Transform the IFN_GOACC_LOOP internal functions by providing the
-   determined partitioning mask and chunking argument.  END_MARKER
-   points at the end IFN_HEAD_TAIL call intgroducing the loop.  IFNS
-   is the number of IFN_GOACC_LOOP calls for the loop.  MASK_ARG is
-   the replacement partitioning mask and CHUNK_ARG is the replacement
-   chunking arg.  */
-
-static void
-oacc_loop_xform_loop (gcall *end_marker, unsigned ifns,
-		      tree mask_arg, tree chunk_arg)
-{
-  gimple_stmt_iterator gsi = gsi_for_stmt (end_marker);
-  
-  gcc_checking_assert (ifns);
-  for (;;)
-    {
-      for (; !gsi_end_p (gsi); gsi_next (&gsi))
-	{
-	  gimple *stmt = gsi_stmt (gsi);
-
-	  if (!is_gimple_call (stmt))
-	    continue;
-
-	  gcall *call = as_a <gcall *> (stmt);
-      
-	  if (!gimple_call_internal_p (call))
-	    continue;
-
-	  if (gimple_call_internal_fn (call) != IFN_GOACC_LOOP)
-	    continue;
-
-	  *gimple_call_arg_ptr (call, 5) = mask_arg;
-	  *gimple_call_arg_ptr (call, 4) = chunk_arg;
-	  ifns--;
-	  if (!ifns)
-	    return;
-	}
-
-      /* The LOOP_BOUND ifn could be in the single successor
-	 block.  */
-      basic_block bb = single_succ (gsi_bb (gsi));
-      gsi = gsi_start_bb (bb);
-    }
-}
-
 /* Process the discovered OpenACC loops, setting the correct
    partitioning level etc.  */
 
@@ -19342,13 +19564,34 @@  oacc_loop_process (oacc_loop *loop)
   if (loop->mask && !loop->routine)
     {
       int ix;
-      unsigned mask = loop->mask;
-      unsigned dim = GOMP_DIM_GANG;
-      tree mask_arg = build_int_cst (unsigned_type_node, mask);
+      tree mask_arg = build_int_cst (unsigned_type_node, loop->mask);
+      tree e_mask_arg = build_int_cst (unsigned_type_node, loop->e_mask);
       tree chunk_arg = loop->chunk_size;
+      gcall *call;
 
-      oacc_loop_xform_loop (loop->head_end, loop->ifns, mask_arg, chunk_arg);
+      for (ix = 0; loop->ifns.iterate (ix, &call); ix++)
+	switch (gimple_call_internal_fn (call))
+	  {
+	  case IFN_GOACC_LOOP:
+	    {
+	      bool is_e = gimple_call_arg (call, 5) == integer_minus_one_node;
+	      *gimple_call_arg_ptr (call, 5) = is_e ? e_mask_arg : mask_arg;
+	      if (!is_e)
+		*gimple_call_arg_ptr (call, 4) = chunk_arg;
+	    }
+	    break;
 
+	  case IFN_GOACC_TILE:
+	    *gimple_call_arg_ptr (call, 3) = mask_arg;
+	    *gimple_call_arg_ptr (call, 4) = e_mask_arg;
+	    break;
+
+	  default:
+	    gcc_unreachable ();
+	  }
+
+      unsigned dim = GOMP_DIM_GANG;
+      unsigned mask = loop->mask | loop->e_mask;
       for (ix = 0; ix != GOMP_DIM_MAX && mask; ix++)
 	{
 	  while (!(GOMP_DIM_MASK (dim) & mask))
@@ -19388,10 +19631,16 @@  oacc_loop_fixed_partitions (oacc_loop *loop, unsig
     {
       bool auto_par = (loop->flags & OLF_AUTO) != 0;
       bool seq_par = (loop->flags & OLF_SEQ) != 0;
+      bool tiling = (loop->flags & OLF_TILE) != 0;
 
       this_mask = ((loop->flags >> OLF_DIM_BASE)
 		   & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1));
 
+      /* Apply auto partitioning if this is a non-partitioned regular
+	 loop, or (no more than) single axis tiled loop.  */
+      bool maybe_auto = !seq_par
+	&& this_mask == (tiling ? this_mask & -this_mask : 0);
+
       if ((this_mask != 0) + auto_par + seq_par > 1)
 	{
 	  if (noisy)
@@ -19399,7 +19648,7 @@  oacc_loop_fixed_partitions (oacc_loop *loop, unsig
 		      seq_par
 		      ? "%<seq%> overrides other OpenACC loop specifiers"
 		      : "%<auto%> conflicts with other OpenACC loop specifiers");
-	  auto_par = false;
+	  maybe_auto = false;
 	  loop->flags &= ~OLF_AUTO;
 	  if (seq_par)
 	    {
@@ -19408,15 +19657,18 @@  oacc_loop_fixed_partitions (oacc_loop *loop, unsig
 	      this_mask = 0;
 	    }
 	}
-      if (auto_par && (loop->flags & OLF_INDEPENDENT))
-	mask_all |= GOMP_DIM_MASK (GOMP_DIM_MAX);
+      if (maybe_auto && (loop->flags & OLF_INDEPENDENT))
+	{
+	  loop->flags |= OLF_AUTO;
+	  mask_all |= GOMP_DIM_MASK (GOMP_DIM_MAX);
+	}
     }
 
   if (this_mask & outer_mask)
     {
       const oacc_loop *outer;
       for (outer = loop->parent; outer; outer = outer->parent)
-	if (outer->mask & this_mask)
+	if ((outer->mask | outer->e_mask) & this_mask)
 	  break;
 
       if (noisy)
@@ -19462,13 +19714,33 @@  oacc_loop_fixed_partitions (oacc_loop *loop, unsig
 	}
     }
 
+  mask_all |= this_mask;
+
+  if (loop->flags & OLF_TILE)
+    {
+      /* When tiling, vector goes to the element loop, and failing
+	 that we put worker there.  The std doesn't contemplate
+	 specifying all three.  We choose to put worker and vector on
+	 the element loops in that case.  */
+      unsigned this_e_mask = this_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR);
+      if (!this_e_mask || this_mask & GOMP_DIM_MASK (GOMP_DIM_GANG))
+	this_e_mask |= this_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);
+
+      loop->e_mask = this_e_mask;
+      this_mask ^= this_e_mask;
+    }
+
   loop->mask = this_mask;
-  mask_all |= this_mask;
-  
+
+  if (dump_file)
+    fprintf (dump_file, "Loop %s:%d user specified %d & %d\n",
+	     LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
+	     loop->mask, loop->e_mask);
+
   if (loop->child)
     {
-      loop->inner = oacc_loop_fixed_partitions (loop->child,
-						outer_mask | this_mask); 
+      loop->inner = oacc_loop_fixed_partitions
+	(loop->child, outer_mask | this_mask | loop->e_mask); 
       mask_all |= loop->inner;
     }
 
@@ -19480,14 +19752,17 @@  oacc_loop_fixed_partitions (oacc_loop *loop, unsig
 
 /* Walk the OpenACC loop heirarchy to assign auto-partitioned loops.
    OUTER_MASK is the partitioning this loop is contained within.
+   OUTER_ASSIGN is true if an outer loop is being auto-partitioned.
    Return the cumulative partitioning used by this loop, siblings and
    children.  */
 
 static unsigned
-oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask)
+oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask,
+			   bool outer_assign)
 {
   bool assign = (loop->flags & OLF_AUTO) && (loop->flags & OLF_INDEPENDENT);
   bool noisy = true;
+  bool tiling = loop->flags & OLF_TILE;
 
 #ifdef ACCEL_COMPILER
   /* When device_type is supported, we want the device compiler to be
@@ -19495,59 +19770,97 @@  static unsigned
   noisy = false;
 #endif
 
-  if (assign && outer_mask < GOMP_DIM_MASK (GOMP_DIM_MAX - 1))
+  if (assign && (!outer_assign | loop->inner))
     {
-      /* Allocate the outermost loop at the outermost available
-	 level.  */
-      unsigned this_mask = outer_mask + 1;
+      /* Allocate outermost and non-innermost loops at the outermost
+	 non-innermost available level.  */
+      unsigned this_mask = GOMP_DIM_MASK (GOMP_DIM_GANG);
+      
+      /* Find the first outermost available partition. */
+      while (this_mask <= outer_mask)
+	this_mask <<= 1;
 
-      if (!(this_mask & loop->inner))
-	loop->mask = this_mask;
-    }
+      /* Grab two axes if tiling, and we've not assigned anything.  */
+      if (tiling && !(loop->mask | loop->e_mask))
+	this_mask |= this_mask << 1;
 
-  if (loop->child)
-    {
-      unsigned child_mask = outer_mask | loop->mask;
+      /* Prohibit the innermost partitioning at the moment.  */
+      this_mask &= GOMP_DIM_MASK (GOMP_DIM_MAX - 1) - 1;
 
-      if (loop->mask || assign)
-	child_mask |= GOMP_DIM_MASK (GOMP_DIM_MAX);
+      /* Don't use any dimension explicitly claimed by an inner loop. */
+      this_mask &= ~loop->inner;
 
-      loop->inner = oacc_loop_auto_partitions (loop->child, child_mask);
+      if (tiling && !loop->e_mask)
+	{
+	  /* If we got two axes, allocate the inner one to the element
+	     loop.  */
+	  loop->e_mask = this_mask & (this_mask << 1);
+	  this_mask ^= loop->e_mask;
+	}
+
+      loop->mask |= this_mask;
     }
 
-  if (assign && !loop->mask)
+  if (loop->child)
+    loop->inner = oacc_loop_auto_partitions
+      (loop->child, outer_mask | loop->mask | loop->e_mask,
+       outer_assign | assign);
+
+  if (assign && (!loop->mask || (tiling && !loop->e_mask) || !outer_assign))
     {
-      /* Allocate the loop at the innermost available level.  */
+      /* Allocate the loop at the innermost available level.  Note
+	 that we do this even if we already assigned this loop the
+	 outermost available level above.  That way we'll partition
+	 this along 2 axes, if they are available.  */
       unsigned this_mask = 0;
-      
+
       /* Determine the outermost partitioning used within this loop. */
       this_mask = loop->inner | GOMP_DIM_MASK (GOMP_DIM_MAX);
       this_mask = least_bit_hwi (this_mask);
 
       /* Pick the partitioning just inside that one.  */
       this_mask >>= 1;
-
       /* And avoid picking one use by an outer loop. */
       this_mask &= ~outer_mask;
 
-      if (!this_mask && noisy)
+      /* If tiling and we failed completely above, grab the next one
+	 too, making sure it doesn't hit an outer loop.  */
+      if (tiling)
+	{
+	  this_mask &= ~(loop->e_mask | loop->mask);
+	  unsigned tile_mask = ((this_mask >> 1)
+				& ~(outer_mask | loop->e_mask | loop->mask));
+
+	  if (tile_mask || loop->mask)
+	    {
+	      loop->e_mask |= this_mask;
+	      this_mask = tile_mask;
+	    }
+	  if (!loop->e_mask && noisy)
+	    warning_at (loop->loc, 0,
+			"insufficient partitioning available"
+			" to parallelize element loop");
+	}
+
+      loop->mask |= this_mask;
+      if (!loop->mask && noisy)
 	warning_at (loop->loc, 0,
-		    "insufficient partitioning available to parallelize loop");
-
-      loop->mask = this_mask;
+		    "insufficient partitioning available"
+		    " to parallelize%s loop", tiling ? " tile" : "");
     }
 
   if (assign && dump_file)
-    fprintf (dump_file, "Auto loop %s:%d assigned %d\n",
+    fprintf (dump_file, "Auto loop %s:%d assigned %d & %d\n",
 	     LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
-	     loop->mask);
+	     loop->mask, loop->e_mask);
 
   unsigned inner_mask = 0;
   
   if (loop->sibling)
-    inner_mask |= oacc_loop_auto_partitions (loop->sibling, outer_mask);
+    inner_mask |= oacc_loop_auto_partitions (loop->sibling,
+					     outer_mask, outer_assign);
   
-  inner_mask |= loop->inner | loop->mask;
+  inner_mask |= loop->inner | loop->mask | loop->e_mask;
 
   return inner_mask;
 }
@@ -19563,7 +19876,7 @@  oacc_loop_partition (oacc_loop *loop, unsigned out
   if (mask_all & GOMP_DIM_MASK (GOMP_DIM_MAX))
     {
       mask_all ^= GOMP_DIM_MASK (GOMP_DIM_MAX);
-      mask_all |= oacc_loop_auto_partitions (loop, outer_mask);
+      mask_all |= oacc_loop_auto_partitions (loop, outer_mask, false);
     }
   return mask_all;
 }
@@ -19713,6 +20026,11 @@  execute_oacc_device_lower ()
 	  {
 	  default: break;
 
+	  case IFN_GOACC_TILE:
+	    oacc_xform_tile (call);
+	    rescan = true;
+	    break;
+
 	  case IFN_GOACC_LOOP:
 	    oacc_xform_loop (call);
 	    rescan = true;
@@ -19740,7 +20058,7 @@  execute_oacc_device_lower ()
 	      switch (kind)
 		{
 		default:
-		  gcc_unreachable ();
+		  break;
 
 		case IFN_UNIQUE_OACC_FORK:
 		case IFN_UNIQUE_OACC_JOIN: