[og10] openacc: Adjust loop lowering for AMD GCN

Message ID 20210113234842.71133-1-julian@codesourcery.com
State New
Series [og10] openacc: Adjust loop lowering for AMD GCN

Commit Message

Julian Brown Jan. 13, 2021, 11:48 p.m. UTC
This patch adjusts OpenACC loop lowering in the AMD GCN target compiler
so that the autovectorizer can vectorize the "vector" dimension of
OpenACC loops in more cases.

Rather than generating "SIMT" code that executes a scalar instruction
stream for each lane of a vector in lockstep, for GCN we model the GPU
like a typical CPU, with separate instructions to operate on scalar and
vector data. That means that, unlike other offload targets, we rely on
the autovectorizer to handle the innermost OpenACC parallelism level,
which is "vector".

Because of this, the OpenACC builtin functions that return the current
vector lane and the vector width return 0 and 1 respectively, despite
the native vector width being 64 elements.
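
For illustration, here is a minimal sketch (not part of this patch) of
what that means for user code on AMD GCN; the literal dimension
argument 2 is GOMP_DIM_VECTOR from gomp-constants.h:

  #pragma acc parallel loop gang worker vector
  for (int i = 0; i < n; i++)
    {
      /* On AMD GCN these return 0 and 1 respectively, even though a
         wavefront is 64 lanes wide: the "vector" level is realised by
         the autovectorizer, not by SIMT lanes.  */
      int lane = __builtin_goacc_parlevel_id (2);    /* GOMP_DIM_VECTOR */
      int width = __builtin_goacc_parlevel_size (2); /* GOMP_DIM_VECTOR */
      /* ... use lane/width ... */
    }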

This allows generated code to work with our chosen compilation model,
but the way loops are lowered in omp-offload.c:oacc_xform_loop does not
correctly account for the discrepancy between the logical (OpenACC) and
physical vector sizes. That means that if a loop is partitioned over
e.g. the worker AND vector dimensions, we actually lower with a unit
vector size -- so if we then autovectorize, we end up trying to
vectorize over the "worker" dimension rather than the vector one!
Because the number of workers is not fixed at compile time, the
autovectorizer has a hard time analysing the loop, and vectorization
often fails entirely.
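
Schematically -- a hand-written pseudo-C sketch with invented names
(worker_id, num_workers and so on; the real lowering builds GIMPLE) --
the old striding scheme for a worker+vector loop on GCN degenerates to:

  /* The logical vector size is 1, so the vector lane contributes
     nothing and the only remaining stride comes from the workers.  */
  int tid = worker_id;       /* vector lane is always 0 */
  int volume = num_workers;  /* runtime value; vector size is 1 */
  for (i = lower + tid * step; i < bound; i += volume * step)
    body (i);

The innermost loop steps by num_workers * step, a value unknown at
compile time, which is what defeats the vectorizer.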

We can fix this by deducing the true vector width in oacc_xform_loop,
and using that when we are on a "non-SIMT" offload target. We can then
rearrange how loops are lowered in that function so that the loop form
fed to the autovectorizer is more amenable to vectorization -- namely,
the innermost loop now steps through consecutive iterations.
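
Roughly, and with the same caveat that this is an invented pseudo-C
sketch of the generated GIMPLE (integer types and the loop direction
are simplified), the new non-chunking lowering looks like:

  int vf = omp_max_vf ();    /* e.g. 64 for AMD GCN */
  int threads = num_gangs * num_workers * vf;
  int chunk_size = (range + threads * step - 1) / (threads * step);

  /* Each (gang, worker) pair owns one contiguous block of
     vf * chunk_size iterations.  */
  int offset = (gang_id * num_workers + worker_id) * vf * chunk_size * step;
  int bound = offset + vf * chunk_size * step;
  if (bound > range) bound = range;

  for (i = lower + offset; i < bound; i += step)  /* unit step */
    body (i);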

For some benchmarks, allowing vectorization to succeed leads to quite
impressive performance improvements -- I've observed speedups of
between 2.5x and 40x on one machine/GPU combination.

The low-level builtins available to user code (__builtin_goacc_parlevel_id
and __builtin_goacc_parlevel_size) continue to return 0/1 respectively
for the vector dimension on AMD GCN, even if their containing loop is
vectorized -- a quirk we may want to address at a later date.

Only non-"chunking" loops are handled at present. "Chunking" loops are
still lowered as before.

Tested with offloading to AMD GCN. I will apply to the og10 branch
shortly.

Julian

2021-01-13  Julian Brown  <julian@codesourcery.com>

gcc/
	* omp-offload.c (oacc_thread_numbers): Add VF_BY_VECTORIZER parameter.
	Add overloaded wrapper with the previous arguments and behaviour.
	(oacc_xform_loop): Lower vector loops to iterate a multiple of
	omp_max_vf times over contiguous steps on non-SIMT targets.

libgomp/testsuite/
	* libgomp.oacc-c-c++-common/loop-gwv-1.c: Adjust for loop lowering
	changes.
	* libgomp.oacc-c-c++-common/loop-wv-1.c: Likewise.
	* libgomp.oacc-c-c++-common/loop-red-gwv-1.c: Likewise.
	* libgomp.oacc-c-c++-common/loop-red-wv-1.c: Likewise.
	* libgomp.oacc-c-c++-common/routine-gwv-1.c: Likewise.
	* libgomp.oacc-c-c++-common/routine-wv-1.c: Likewise.
---
 gcc/omp-offload.c                             | 160 ++++++++++++++----
 .../libgomp.oacc-c-c++-common/loop-gwv-1.c    |  15 +-
 .../loop-red-gwv-1.c                          |  17 +-
 .../libgomp.oacc-c-c++-common/loop-red-wv-1.c |  16 ++
 .../libgomp.oacc-c-c++-common/loop-wv-1.c     |  16 ++
 .../libgomp.oacc-c-c++-common/routine-gwv-1.c |  17 +-
 .../libgomp.oacc-c-c++-common/routine-wv-1.c  |  16 ++
 7 files changed, 214 insertions(+), 43 deletions(-)

Patch

diff --git a/gcc/omp-offload.c b/gcc/omp-offload.c
index f5ce34d3bdd8..bb3bfd130ee4 100644
--- a/gcc/omp-offload.c
+++ b/gcc/omp-offload.c
@@ -470,11 +470,13 @@  oacc_dim_call (bool pos, int dim, gimple_seq *seq)
 }
 
 /* Find the number of threads (POS = false), or thread number (POS =
-   true) for an OpenACC region partitioned as MASK.  Setup code
+   true) for an OpenACC region partitioned as MASK.  If VF_BY_VECTORIZER is
+   true, use that as the vectorization factor for the auto-vectorized
+   dimension size, instead of calling the builtin function.  Setup code
    required for the calculation is added to SEQ.  */
 
 static tree
-oacc_thread_numbers (bool pos, int mask, gimple_seq *seq)
+oacc_thread_numbers (bool pos, int mask, tree vf_by_vectorizer, gimple_seq *seq)
 {
   tree res = pos ? NULL_TREE : build_int_cst (unsigned_type_node, 1);
   unsigned ix;
@@ -487,13 +489,15 @@  oacc_thread_numbers (bool pos, int mask, gimple_seq *seq)
 	  {
 	    /* We had an outer index, so scale that by the size of
 	       this dimension.  */
-	    tree n = oacc_dim_call (false, ix, seq);
+	    tree n = (ix == GOMP_DIM_VECTOR && vf_by_vectorizer)
+		     ? vf_by_vectorizer : oacc_dim_call (false, ix, seq);
 	    res = fold_build2 (MULT_EXPR, integer_type_node, res, n);
 	  }
 	if (pos)
 	  {
 	    /* Determine index in this dimension.  */
-	    tree id = oacc_dim_call (true, ix, seq);
+	    tree id = (ix == GOMP_DIM_VECTOR && vf_by_vectorizer)
+		      ? integer_zero_node : oacc_dim_call (true, ix, seq);
 	    if (res)
 	      res = fold_build2 (PLUS_EXPR, integer_type_node, res, id);
 	    else
@@ -507,6 +511,12 @@  oacc_thread_numbers (bool pos, int mask, gimple_seq *seq)
   return res;
 }
 
+static tree
+oacc_thread_numbers (bool pos, int mask, gimple_seq *seq)
+{
+  return oacc_thread_numbers (pos, mask, NULL_TREE, seq);
+}
+
 /* Transform IFN_GOACC_LOOP calls to actual code.  See
    expand_oacc_for for where these are generated.  At the vector
    level, we stride loops, such that each member of a warp will
@@ -534,6 +544,8 @@  oacc_xform_loop (gcall *call)
   bool chunking = false, striding = true;
   unsigned outer_mask = mask & (~mask + 1); // Outermost partitioning
   unsigned inner_mask = mask & ~outer_mask; // Inner partitioning (if any)
+  bool vec_tiles = true;
+  tree vf_by_vectorizer = NULL_TREE;
 
   /* Skip lowering if return value of IFN_GOACC_LOOP call is not used.  */
   if (!lhs)
@@ -561,16 +573,39 @@  oacc_xform_loop (gcall *call)
       striding = integer_onep (chunk_size);
       chunking = !striding;
     }
+
+  if (!chunking
+      && !targetm.simt.vf
+      && (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
+    {
+      poly_uint64 max_vf = omp_max_vf ();
+      vf_by_vectorizer = build_int_cst (integer_type_node, max_vf);
+    }
+
 #endif
 
-  /* striding=true, chunking=true
+  /* For SIMT targets:
+
+     striding=true, chunking=true
        -> invalid.
      striding=true, chunking=false
        -> chunks=1
      striding=false,chunking=true
        -> chunks=ceil (range/(chunksize*threads*step))
      striding=false,chunking=false
-       -> chunk_size=ceil(range/(threads*step)),chunks=1  */
+       -> chunk_size=ceil(range/(threads*step)),chunks=1
+
+     For non-SIMT targets:
+
+      striding=N/A, chunking=true
+	-> as above, for now.
+      striding=N/A, chunking=false
+	-> chunks=1
+	   threads=gangs*workers*vf
+	   chunk_size=ceil(range/(threads*step))
+	   inner chunking loop steps by "step", vf*chunk_size times.
+  */
+
   push_gimplify_context (true);
 
   switch (code)
@@ -589,49 +624,83 @@  oacc_xform_loop (gcall *call)
 	  chunk_size = fold_convert (type, chunk_size);
 	  per = fold_build2 (MULT_EXPR, type, per, chunk_size);
 	  per = fold_build2 (MULT_EXPR, type, per, step);
-	  r = build2 (MINUS_EXPR, type, range, dir);
-	  r = build2 (PLUS_EXPR, type, r, per);
+	  r = fold_build2 (MINUS_EXPR, type, range, dir);
+	  r = fold_build2 (PLUS_EXPR, type, r, per);
 	  r = build2 (TRUNC_DIV_EXPR, type, r, per);
 	}
       break;
 
     case IFN_GOACC_LOOP_STEP:
       {
-	/* If striding, step by the entire compute volume, otherwise
-	   step by the inner volume.  */
-	unsigned volume = striding ? mask : inner_mask;
+	if (vf_by_vectorizer)
+	  r = step;
+	else
+	  {
+	    /* If striding, step by the entire compute volume, otherwise
+	       step by the inner volume.  */
+	    unsigned volume = striding ? mask : inner_mask;
 
-	r = oacc_thread_numbers (false, volume, &seq);
-	r = build2 (MULT_EXPR, type, fold_convert (type, r), step);
+	    r = oacc_thread_numbers (false, volume, &seq);
+	    r = build2 (MULT_EXPR, type, fold_convert (type, r), step);
+	  }
       }
       break;
 
     case IFN_GOACC_LOOP_OFFSET:
-      /* Enable vectorization on non-SIMT targets.  */
-      if (!targetm.simt.vf
-	  && outer_mask == GOMP_DIM_MASK (GOMP_DIM_VECTOR)
+      if (vf_by_vectorizer)
+	{
 	  /* If not -fno-tree-loop-vectorize, hint that we want to vectorize
 	     the loop.  */
-	  && (flag_tree_loop_vectorize
-	      || !global_options_set.x_flag_tree_loop_vectorize))
-	{
-	  basic_block bb = gsi_bb (gsi);
-	  class loop *parent = bb->loop_father;
-	  class loop *body = parent->inner;
-
-	  parent->force_vectorize = true;
-	  parent->safelen = INT_MAX;
-
-	  /* "Chunking loops" may have inner loops.  */
-	  if (parent->inner)
+	  if (flag_tree_loop_vectorize
+	      || !global_options_set.x_flag_tree_loop_vectorize)
 	    {
-	      body->force_vectorize = true;
-	      body->safelen = INT_MAX;
+	      /* Enable vectorization on non-SIMT targets.  */
+	      basic_block bb = gsi_bb (gsi);
+	      class loop *chunk_loop = bb->loop_father;
+	      class loop *inner_loop = chunk_loop->inner;
+
+	      /* Chunking isn't supported for VF_BY_VECTORIZER loops yet,
+		 so we know that the outer chunking loop will be executed just
+		 once and the inner loop is the one which must be
+		 vectorized (unless it has been optimized out for some
+		 reason).  */
+	      gcc_assert (!chunking);
+
+	      if (inner_loop)
+		{
+		  inner_loop->force_vectorize = true;
+		  inner_loop->safelen = INT_MAX;
+
+		  cfun->has_force_vectorize_loops = true;
+		}
 	    }
 
-	  cfun->has_force_vectorize_loops = true;
+	  /* ...and expand the abstract loops such that the vectorizer can
+	     work on them more effectively.
+
+	     It might be nicer to merge this code with the "!striding" case
+	     below, particularly if chunking support is added.  */
+	  tree warppos
+	    = oacc_thread_numbers (true, mask, vf_by_vectorizer, &seq);
+	  warppos = fold_convert (diff_type, warppos);
+
+	  tree volume
+	    = oacc_thread_numbers (false, mask, vf_by_vectorizer, &seq);
+	  volume = fold_convert (diff_type, volume);
+
+	  tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
+	  chunk_size = fold_build2 (PLUS_EXPR, diff_type, range, per);
+	  chunk_size = fold_build2 (MINUS_EXPR, diff_type, chunk_size, dir);
+	  chunk_size = fold_build2 (TRUNC_DIV_EXPR, diff_type, chunk_size,
+				    per);
+
+	  warppos = fold_build2 (MULT_EXPR, diff_type, warppos, chunk_size);
+
+	  tree chunk = fold_convert (diff_type, gimple_call_arg (call, 6));
+	  chunk = fold_build2 (MULT_EXPR, diff_type, chunk, volume);
+	  r = fold_build2 (PLUS_EXPR, diff_type, chunk, warppos);
 	}
-      if (striding)
+      else if (striding)
 	{
 	  r = oacc_thread_numbers (true, mask, &seq);
 	  r = fold_convert (diff_type, r);
@@ -649,7 +718,7 @@  oacc_xform_loop (gcall *call)
 	  else
 	    {
 	      tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
-
+	      /* chunk_size = (range + per - 1) / per.  */
 	      chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
 	      chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
 	      chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
@@ -681,7 +750,28 @@  oacc_xform_loop (gcall *call)
       break;
 
     case IFN_GOACC_LOOP_BOUND:
-      if (striding)
+      if (vf_by_vectorizer)
+	{
+	  tree volume
+	    = oacc_thread_numbers (false, mask, vf_by_vectorizer, &seq);
+	  volume = fold_convert (diff_type, volume);
+
+	  tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
+	  chunk_size = fold_build2 (PLUS_EXPR, diff_type, range, per);
+	  chunk_size = fold_build2 (MINUS_EXPR, diff_type, chunk_size, dir);
+	  chunk_size = fold_build2 (TRUNC_DIV_EXPR, diff_type, chunk_size,
+				    per);
+
+	  vf_by_vectorizer = fold_convert (diff_type, vf_by_vectorizer);
+	  tree vecsize = fold_build2 (MULT_EXPR, diff_type, chunk_size,
+				      vf_by_vectorizer);
+	  vecsize = fold_build2 (MULT_EXPR, diff_type, vecsize, step);
+	  tree vecend = fold_convert (diff_type, gimple_call_arg (call, 6));
+	  vecend = fold_build2 (PLUS_EXPR, diff_type, vecend, vecsize);
+	  r = fold_build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR, diff_type,
+			   range, vecend);
+	}
+      else if (striding)
 	r = range;
       else
 	{
@@ -696,7 +786,7 @@  oacc_xform_loop (gcall *call)
 	  else
 	    {
 	      tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
-
+	      /* chunk_size = (range + per - 1) / per.  */
 	      chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
 	      chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
 	      chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c
index 5c8430120618..c444543586fc 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c
@@ -45,12 +45,23 @@  int main ()
       int expected = ix;
       if(ondev)
 	{
-	  int chunk_size = (N + gangsize * workersize * vectorsize - 1)
-			   / (gangsize * workersize * vectorsize);
+#if defined (ACC_DEVICE_TYPE_radeon) && defined (__OPTIMIZE__)
+	  int use_vectorsize = 64;
+#else
+	  int use_vectorsize = vectorsize;
+#endif
+	  int chunk_size = (N + gangsize * workersize * use_vectorsize - 1)
+			   / (gangsize * workersize * use_vectorsize);
 	  
+#ifdef ACC_DEVICE_TYPE_radeon
+	  int g = ix / (chunk_size * workersize * use_vectorsize);
+	  int w = (ix / (chunk_size * use_vectorsize)) % workersize;
+	  int v = 0;
+#else
 	  int g = ix / (chunk_size * workersize * vectorsize);
 	  int w = (ix / vectorsize) % workersize;
 	  int v = ix % vectorsize;
+#endif
 
 	  expected = (g << 16) | (w << 8) | v;
 	}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c
index 9c4a85f7b16b..1571236bfb49 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-gwv-1.c
@@ -40,12 +40,23 @@  int main ()
       int val = ix;
       if (ondev)
 	{
-	  int chunk_size = (N + gangsize * workersize * vectorsize - 1)
-			   / (gangsize * workersize * vectorsize);
-	  
+#if defined (ACC_DEVICE_TYPE_radeon) && defined (__OPTIMIZE__)
+	  int use_vectorsize = 64;
+#else
+	  int use_vectorsize = vectorsize;
+#endif
+	  int chunk_size = (N + gangsize * workersize * use_vectorsize - 1)
+			   / (gangsize * workersize * use_vectorsize);
+
+#ifdef ACC_DEVICE_TYPE_radeon
+	  int g = ix / (chunk_size * workersize * use_vectorsize);
+	  int w = (ix / (chunk_size * use_vectorsize)) % workersize;
+	  int v = 0;
+#else
 	  int g = ix / (chunk_size * vectorsize * workersize);
 	  int w = ix / vectorsize % workersize;
 	  int v = ix % vectorsize;
+#endif
 
 	  val = (g << 16) | (w << 8) | v;
 	}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c
index c360ad11e7cb..423fbf442a29 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-red-wv-1.c
@@ -40,8 +40,24 @@  int main ()
       if(ondev)
 	{
 	  int g = 0;
+#ifdef ACC_DEVICE_TYPE_radeon
+#  ifdef __OPTIMIZE__
+	  int use_vecsize = 64;
+#  else
+	  int use_vecsize = vectorsize;
+#  endif
+	  /* For Radeon, the loop is split into contiguous blocks of
+	     chunk_size * vector_size, with chunk_size selected to cover the
+	     whole iteration space.  Each block is then autovectorized where
+	     possible.  */
+	  int chunk_size = (N + workersize * use_vecsize - 1)
+			   / (workersize * use_vecsize);
+	  int w = ix / (chunk_size * use_vecsize);
+	  int v = 0;
+#else
 	  int w = (ix / vectorsize) % workersize;
 	  int v = ix % vectorsize;
+#endif
 
 	  val = (g << 16) | (w << 8) | v;
 	}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c
index fd4e4cf5ea9c..e4ffe3931bbe 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c
@@ -43,8 +43,24 @@  int main ()
       if(ondev)
 	{
 	  int g = 0;
+#ifdef ACC_DEVICE_TYPE_radeon
+#  ifdef __OPTIMIZE__
+	  int use_vecsize = 64;
+#  else
+	  int use_vecsize = vectorsize;
+#  endif
+	  /* For Radeon, the loop is split into contiguous blocks of
+	     chunk_size * vector_size, with chunk_size selected to cover the
+	     whole iteration space.  Each block is then autovectorized where
+	     possible.  */
+	  int chunk_size = (N + workersize * use_vecsize - 1)
+			   / (workersize * use_vecsize);
+	  int w = ix / (chunk_size * use_vecsize);
+	  int v = 0;
+#else
 	  int w = (ix / vectorsize) % workersize;
 	  int v = ix % vectorsize;
+#endif
 
 	  expected = (g << 16) | (w << 8) | v;
 	}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-gwv-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-gwv-1.c
index da13d84908a8..a4af1902d3d7 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-gwv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-gwv-1.c
@@ -49,12 +49,23 @@  int main ()
       int expected = ix;
       if(ondev)
 	{
-	  int chunk_size = (N + gangsize * workersize * vectorsize - 1)
-			   / (gangsize * workersize * vectorsize);
+#if defined (ACC_DEVICE_TYPE_radeon) && defined (__OPTIMIZE__)
+	  int use_vectorsize = 64;
+#else
+	  int use_vectorsize = vectorsize;
+#endif
+	  int chunk_size = (N + gangsize * workersize * use_vectorsize - 1)
+			   / (gangsize * workersize * use_vectorsize);
 	  
-	  int g = ix / (chunk_size * vectorsize * workersize);
+#ifdef ACC_DEVICE_TYPE_radeon
+	  int g = ix / (chunk_size * workersize * use_vectorsize);
+	  int w = (ix / (chunk_size * use_vectorsize)) % workersize;
+	  int v = 0;
+#else
+	  int g = ix / (chunk_size * workersize * vectorsize);
 	  int w = (ix / vectorsize) % workersize;
 	  int v = ix % vectorsize;
+#endif
 
 	  expected = (g << 16) | (w << 8) | v;
 	}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-1.c
index 73696e4e59a3..091ef0682499 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/routine-wv-1.c
@@ -50,8 +50,24 @@  int main ()
       if(ondev)
 	{
 	  int g = 0;
+#ifdef ACC_DEVICE_TYPE_radeon
+#  ifdef __OPTIMIZE__
+	  int use_vecsize = 64;
+#  else
+	  int use_vecsize = vectorsize;
+#  endif
+	  /* For Radeon, the loop is split into contiguous blocks of
+	     chunk_size * vector_size, with chunk_size selected to cover the
+	     whole iteration space.  Each block is then autovectorized where
+	     possible.  */
+	  int chunk_size = (N + workersize * use_vecsize - 1)
+			   / (workersize * use_vecsize);
+	  int w = ix / (chunk_size * use_vecsize);
+	  int v = 0;
+#else
 	  int w = (ix / vectorsize) % workersize;
 	  int v = ix % vectorsize;
+#endif
 
 	  expected = (g << 16) | (w << 8) | v;
 	}