[gomp4] Add tests for OpenACC worker-single/worker-partitioned modes
diff mbox

Message ID 20150604152534.03b2e27c@octopus
State New
Headers show

Commit Message

Julian Brown June 4, 2015, 2:25 p.m. UTC
Hi,

This patch adds a set of tests for worker-single predication (added
by Bernd in https://gcc.gnu.org/ml/gcc-patches/2015-06/msg00094.html)
and worker-partitioned mode for OpenACC.

Results generally look good, though support for synchronisation after
worker loops is currently missing, so the corresponding tests are
XFAILed for NVidia (I will look into fixing that).

I will apply shortly.

Thanks,

Julian

ChangeLog

    libgomp/
    * testsuite/libgomp.oacc-c-c++-common/
    worker-single-{1,1a,2,3,4,5,6}.c: New tests.
    * testsuite/libgomp.oacc-c-c++-common/
    worker-partn-{1,2,3,4,5,6,7}.c: New tests.

Patch
diff mbox

commit c4edb6e748c86c2bc5251707f61d4d37679194cf
Author: Julian Brown <julian@codesourcery.com>
Date:   Thu Jun 4 07:16:56 2015 -0700

    Add a set of OpenACC worker-single/worker-partitioned mode tests.

diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-partn-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-partn-1.c
new file mode 100644
index 0000000..1bdb8ea
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-partn-1.c
@@ -0,0 +1,30 @@ 
+#include <assert.h>
+
+/* Test worker-partitioned/vector-single mode.  */
+
+int
+main (int argc, char *argv[])
+{
+  int arr[32 * 8], i;
+
+  for (i = 0; i < 32 * 8; i++)
+    arr[i] = 0;
+
+  #pragma acc parallel copy(arr) num_gangs(8) num_workers(8) vector_length(32)
+  {
+    int j;
+    #pragma acc loop gang
+    for (j = 0; j < 32; j++)
+      {
+	int k;
+	#pragma acc loop worker
+	for (k = 0; k < 8; k++)
+          arr[j * 8 + k] += j * 8 + k;
+      }
+  }
+
+  for (i = 0; i < 32 * 8; i++)
+    assert (arr[i] == i);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-partn-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-partn-2.c
new file mode 100644
index 0000000..1023e22
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-partn-2.c
@@ -0,0 +1,44 @@ 
+#include <assert.h>
+
+/* Test condition in worker-partitioned mode.  */
+
+int
+main (int argc, char *argv[])
+{
+  int arr[32 * 32 * 8], i;
+
+  for (i = 0; i < 32 * 32 * 8; i++)
+    arr[i] = i;
+
+  #pragma acc parallel copy(arr) num_gangs(8) num_workers(8) vector_length(32)
+  {
+    int j;
+    #pragma acc loop gang
+    for (j = 0; j < 32; j++)
+      {
+	int k;
+	#pragma acc loop worker
+	for (k = 0; k < 8; k++)
+	  {
+	    int m;
+	    if ((k % 2) == 0)
+	      {
+		#pragma acc loop vector
+		for (m = 0; m < 32; m++)
+		  arr[j * 32 * 8 + k * 32 + m]++;
+	      }
+	    else
+	      {
+		#pragma acc loop vector
+		for (m = 0; m < 32; m++)
+		  arr[j * 32 * 8 + k * 32 + m] += 2;
+	      }
+	  }
+      }
+  }
+
+  for (i = 0; i < 32 * 32 * 8; i++)
+    assert (arr[i] == i + ((i / 32) % 2) + 1);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-partn-3.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-partn-3.c
new file mode 100644
index 0000000..a13a571
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-partn-3.c
@@ -0,0 +1,58 @@ 
+#include <assert.h>
+
+/* Test switch in worker-partitioned mode.  */
+
+int
+main (int argc, char *argv[])
+{
+  int arr[32 * 32 * 8], i;
+
+  for (i = 0; i < 32 * 32 * 8; i++)
+    arr[i] = i;
+
+  #pragma acc parallel copy(arr) num_gangs(8) num_workers(8) vector_length(32)
+  {
+    int j;
+    #pragma acc loop gang
+    for (j = 0; j < 32; j++)
+      {
+	int k;
+	#pragma acc loop worker
+	for (k = 0; k < 8; k++)
+	  {
+	    int m;
+	    /* Select on the index of the 32-element row (j * 8 + k) that
+	       this iteration updates; the check below recovers the same
+	       index as i / 32.  */
+	    switch ((j * 8 + k) % 3)
+	    {
+	    case 0:
+	      #pragma acc loop vector
+	      for (m = 0; m < 32; m++)
+		arr[j * 32 * 8 + k * 32 + m]++;
+	      break;
+
+	    case 1:
+	      #pragma acc loop vector
+	      for (m = 0; m < 32; m++)
+		arr[j * 32 * 8 + k * 32 + m] += 2;
+	      break;
+
+	    case 2:
+	      #pragma acc loop vector
+	      for (m = 0; m < 32; m++)
+		arr[j * 32 * 8 + k * 32 + m] += 3;
+	      break;
+
+	    default: ;
+	    }
+	  }
+      }
+  }
+
+  /* Element i belongs to row i / 32 and was bumped by (row % 3) + 1.  */
+  for (i = 0; i < 32 * 32 * 8; i++)
+    assert (arr[i] == i + ((i / 32) % 3) + 1);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-partn-4.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-partn-4.c
new file mode 100644
index 0000000..0902c80
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-partn-4.c
@@ -0,0 +1,54 @@ 
+#include <assert.h>
+
+/* Test worker-single/worker-partitioned transitions.  */
+
+int
+main (int argc, char *argv[])
+{
+  int n[32], arr[32 * 32], i;
+
+  for (i = 0; i < 32 * 32; i++)
+    arr[i] = 0;
+
+  for (i = 0; i < 32; i++)
+    n[i] = 0;
+
+  #pragma acc parallel copy(n, arr) num_gangs(8) num_workers(16) \
+	  vector_length(32)
+  {
+    int j;
+    #pragma acc loop gang
+    for (j = 0; j < 32; j++)
+      {
+	int k;
+
+	n[j]++;
+
+	#pragma acc loop worker
+	for (k = 0; k < 32; k++)
+          arr[j * 32 + k]++;
+
+	n[j]++;
+
+	#pragma acc loop worker
+	for (k = 0; k < 32; k++)
+          arr[j * 32 + k]++;
+
+	n[j]++;
+
+	#pragma acc loop worker
+	for (k = 0; k < 32; k++)
+          arr[j * 32 + k]++;
+
+	n[j]++;
+      }
+  }
+
+  for (i = 0; i < 32; i++)
+    assert (n[i] == 4);
+
+  for (i = 0; i < 32 * 32; i++)
+    assert (arr[i] == 3);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-partn-5.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-partn-5.c
new file mode 100644
index 0000000..fc66b04
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-partn-5.c
@@ -0,0 +1,47 @@ 
+/* { dg-xfail-run-if "TODO" { openacc_nvidia_accel_selected } { "*" } { "" } } */
+
+#include <assert.h>
+
+/* Test correct synchronisation between worker-partitioned loops.  */
+
+int
+main (int argc, char *argv[])
+{
+  int arr_a[32 * 32], arr_b[32 * 32], i;
+  int num_workers, num_gangs;
+
+  for (num_workers = 1; num_workers <= 32; num_workers <<= 1)
+    for (num_gangs = 1; num_gangs <= 32; num_gangs <<= 1)
+      {
+	for (i = 0; i < 32 * 32; i++)
+	  arr_a[i] = i;
+
+	#pragma acc parallel copyin(arr_a) copyout(arr_b) num_gangs(num_gangs) \
+		num_workers(num_workers) vector_length(32)
+	{
+	  int j;
+	  #pragma acc loop gang
+	  for (j = 0; j < 32; j++)
+	    {
+	      int k;
+
+	      #pragma acc loop worker
+	      for (k = 0; k < 32; k++)
+        	arr_b[j * 32 + (31 - k)] = arr_a[j * 32 + k] * 2;
+
+	      #pragma acc loop worker
+	      for (k = 0; k < 32; k++)
+        	arr_a[j * 32 + (31 - k)] = arr_b[j * 32 + k] * 2;
+
+	      #pragma acc loop worker
+	      for (k = 0; k < 32; k++)
+        	arr_b[j * 32 + (31 - k)] = arr_a[j * 32 + k] * 2;
+	    }
+	}
+
+	for (i = 0; i < 32 * 32; i++)
+	  assert (arr_b[i] == (i ^ 31) * 8);
+      }
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-partn-6.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-partn-6.c
new file mode 100644
index 0000000..0f3a1d7
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-partn-6.c
@@ -0,0 +1,47 @@ 
+/* { dg-xfail-run-if "TODO" { openacc_nvidia_accel_selected } { "*" } { "" } } */
+
+#include <assert.h>
+
+/* Test correct synchronisation between worker+vector-partitioned loops.  */
+
+int
+main (int argc, char *argv[])
+{
+  int arr_a[32 * 32 * 32], arr_b[32 * 32 * 32], i;
+  int num_workers, num_gangs;
+
+  for (num_workers = 1; num_workers <= 32; num_workers <<= 1)
+    for (num_gangs = 1; num_gangs <= 32; num_gangs <<= 1)
+      {
+	for (i = 0; i < 32 * 32 * 32; i++)
+	  arr_a[i] = i;
+
+	#pragma acc parallel copyin(arr_a) copyout(arr_b) num_gangs(num_gangs) \
+		num_workers(num_workers) vector_length(32)
+	{
+	  int j;
+	  #pragma acc loop gang
+	  for (j = 0; j < 32; j++)
+	    {
+	      int k;
+
+	      #pragma acc loop worker vector
+	      for (k = 0; k < 32 * 32; k++)
+        	arr_b[j * 32 * 32 + (1023 - k)] = arr_a[j * 32 * 32 + k] * 2;
+
+	      #pragma acc loop worker vector
+	      for (k = 0; k < 32 * 32; k++)
+        	arr_a[j * 32 * 32 + (1023 - k)] = arr_b[j * 32 * 32 + k] * 2;
+
+	      #pragma acc loop worker vector
+	      for (k = 0; k < 32 * 32; k++)
+        	arr_b[j * 32 * 32 + (1023 - k)] = arr_a[j * 32 * 32 + k] * 2;
+	    }
+	}
+
+	for (i = 0; i < 32 * 32 * 32; i++)
+	  assert (arr_b[i] == (i ^ 1023) * 8);
+      }
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-partn-7.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-partn-7.c
new file mode 100644
index 0000000..fe0c59c
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-partn-7.c
@@ -0,0 +1,90 @@ 
+#include <assert.h>
+
+/* Test correct synchronisation between vector-partitioned loops in
+   worker-partitioned mode.  */
+
+int
+main (int argc, char *argv[])
+{
+  int n[32 * 32], arr_a[32 * 32 * 32], arr_b[32 * 32 * 32], i;
+  int num_workers, num_gangs;
+
+  for (num_workers = 1; num_workers <= 32; num_workers <<= 1)
+    for (num_gangs = 1; num_gangs <= 32; num_gangs <<= 1)
+      {
+	for (i = 0; i < 32 * 32 * 32; i++)
+	  arr_a[i] = i;
+
+	for (i = 0; i < 32 * 32; i++)
+          n[i] = 0;
+
+	#pragma acc parallel copy (n) copyin(arr_a) copyout(arr_b) \
+		num_gangs(num_gangs) num_workers(num_workers) vector_length(32)
+	{
+	  int j;
+	  #pragma acc loop gang
+	  for (j = 0; j < 32; j++)
+	    {
+	      int k;
+
+	      #pragma acc loop worker
+	      for (k = 0; k < 32; k++)
+		{
+		  int m;
+
+		  n[j * 32 + k]++;
+
+		  #pragma acc loop vector
+		  for (m = 0; m < 32; m++)
+		    {
+	              if (((j * 1024 + k * 32 + m) % 2) == 0)
+			arr_b[j * 1024 + k * 32 + (31 - m)]
+			  = arr_a[j * 1024 + k * 32 + m] * 2;
+		      else
+			arr_b[j * 1024 + k * 32 + (31 - m)]
+			  = arr_a[j * 1024 + k * 32 + m] * 3;
+		    }
+
+		  /* Test returning to vector-single mode...  */
+		  n[j * 32 + k]++;
+
+		  #pragma acc loop vector
+		  for (m = 0; m < 32; m++)
+		    {
+	              if (((j * 1024 + k * 32 + m) % 3) == 0)
+			arr_a[j * 1024 + k * 32 + (31 - m)]
+			  = arr_b[j * 1024 + k * 32 + m] * 5;
+		      else
+			arr_a[j * 1024 + k * 32 + (31 - m)]
+			  = arr_b[j * 1024 + k * 32 + m] * 7;
+		    }
+
+		  /* ...and back-to-back vector loops.  */
+
+		  #pragma acc loop vector
+		  for (m = 0; m < 32; m++)
+		    {
+	              if (((j * 1024 + k * 32 + m) % 2) == 0)
+			arr_b[j * 1024 + k * 32 + (31 - m)]
+			  = arr_a[j * 1024 + k * 32 + m] * 3;
+		      else
+			arr_b[j * 1024 + k * 32 + (31 - m)]
+			  = arr_a[j * 1024 + k * 32 + m] * 2;
+		    }
+		}
+	    }
+	}
+
+	for (i = 0; i < 32 * 32; i++)
+          assert (n[i] == 2);
+
+	for (i = 0; i < 32 * 32 * 32; i++)
+          {
+	    int m = 6 * ((i % 3) == 0 ? 5 : 7);
+	    assert (arr_b[i] == (i ^ 31) * m);
+	  }
+      }
+
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-single-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-single-1.c
new file mode 100644
index 0000000..34b2294
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-single-1.c
@@ -0,0 +1,25 @@ 
+#include <assert.h>
+
+/* Test worker-single/vector-single mode.  */
+
+int
+main (int argc, char *argv[])
+{
+  int arr[32], i;
+
+  for (i = 0; i < 32; i++)
+    arr[i] = 0;
+
+  #pragma acc parallel copy(arr) num_gangs(8) num_workers(8) vector_length(32)
+  {
+    int j;
+    #pragma acc loop gang
+    for (j = 0; j < 32; j++)
+      arr[j]++;
+  }
+
+  for (i = 0; i < 32; i++)
+    assert (arr[i] == 1);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-single-1a.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-single-1a.c
new file mode 100644
index 0000000..99c6dfb
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-single-1a.c
@@ -0,0 +1,28 @@ 
+#include <assert.h>
+
+/* Test worker-single/vector-single mode.  */
+
+int
+main (int argc, char *argv[])
+{
+  int arr[32], i;
+
+  for (i = 0; i < 32; i++)
+    arr[i] = 0;
+
+  #pragma acc parallel copy(arr) num_gangs(8) num_workers(8) vector_length(32)
+  {
+    int j;
+    #pragma acc loop gang
+    for (j = 0; j < 32; j++)
+      {
+	#pragma acc atomic
+	arr[j]++;
+      }
+  }
+
+  for (i = 0; i < 32; i++)
+    assert (arr[i] == 1);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-single-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-single-2.c
new file mode 100644
index 0000000..9563e99
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-single-2.c
@@ -0,0 +1,33 @@ 
+#include <assert.h>
+
+/* Test condition in worker-single/vector-single mode.  */
+
+int
+main (int argc, char *argv[])
+{
+  int arr[32], i;
+
+  for (i = 0; i < 32; i++)
+    arr[i] = i;
+
+  /* Each element starts out equal to its index, so odd-indexed elements
+     take the "++" branch and even-indexed ones the "+= 2" branch.  */
+  #pragma acc parallel copy(arr) num_gangs(8) num_workers(8) vector_length(32)
+  {
+    int j;
+    #pragma acc loop gang
+    for (j = 0; j < 32; j++)
+      if ((arr[j] % 2) != 0)
+	arr[j]++;
+      else
+	arr[j] += 2;
+  }
+
+  /* The conditional expression needs its own parentheses: "==" binds
+     tighter than "?:", so without them the assertion would test the
+     always-nonzero result of the conditional and could never fail.  */
+  for (i = 0; i < 32; i++)
+    assert (arr[i] == (((i % 2) != 0) ? i + 1 : i + 2));
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-single-3.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-single-3.c
new file mode 100644
index 0000000..50d4887
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-single-3.c
@@ -0,0 +1,33 @@ 
+#include <assert.h>
+
+/* Test switch in worker-single/vector-single mode.  */
+
+int
+main (int argc, char *argv[])
+{
+  int arr[32], i;
+
+  for (i = 0; i < 32; i++)
+    arr[i] = i;
+
+  #pragma acc parallel copy(arr) num_gangs(8) num_workers(8) vector_length(32)
+  {
+    int j;
+    #pragma acc loop gang
+    for (j = 0; j < 32; j++)
+      switch (arr[j] % 5)
+	{
+	case 0: arr[j] += 1; break;
+	case 1: arr[j] += 2; break;
+	case 2: arr[j] += 3; break;
+	case 3: arr[j] += 4; break;
+	case 4: arr[j] += 5; break;
+	default: arr[j] += 99;
+	}
+  }
+
+  for (i = 0; i < 32; i++)
+    assert (arr[i] == i + (i % 5) + 1);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-single-4.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-single-4.c
new file mode 100644
index 0000000..bed90eb
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-single-4.c
@@ -0,0 +1,33 @@ 
+#include <assert.h>
+
+/* Test worker-single/vector-partitioned mode.  */
+
+int
+main (int argc, char *argv[])
+{
+  int arr[32 * 32], i;
+
+  for (i = 0; i < 32 * 32; i++)
+    arr[i] = i;
+
+  #pragma acc parallel copy(arr) num_gangs(8) num_workers(8) vector_length(32)
+  {
+    int j;
+    #pragma acc loop gang
+    for (j = 0; j < 32; j++)
+      {
+	int k;
+	#pragma acc loop vector
+	for (k = 0; k < 32; k++)
+	  {
+	    #pragma acc atomic
+	    arr[j * 32 + k]++;
+	  }
+      }
+  }
+
+  for (i = 0; i < 32 * 32; i++)
+    assert (arr[i] == i + 1);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-single-5.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-single-5.c
new file mode 100644
index 0000000..3ec74a4
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-single-5.c
@@ -0,0 +1,49 @@ 
+#include <assert.h>
+
+/* Test multiple conditional vector-partitioned loops in worker-single
+   mode.  */
+
+int
+main (int argc, char *argv[])
+{
+  int arr[32 * 32], i;
+
+  for (i = 0; i < 32 * 32; i++)
+    arr[i] = i;
+
+  #pragma acc parallel copy(arr) num_gangs(8) num_workers(8) vector_length(32)
+  {
+    int j;
+    #pragma acc loop gang
+    for (j = 0; j < 32; j++)
+      {
+	int k;
+	if ((j % 3) == 0)
+	  {
+	    #pragma acc loop vector
+	    for (k = 0; k < 32; k++)
+	      {
+		#pragma acc atomic
+		arr[j * 32 + k] += 3;
+	      }
+	  }
+	else if ((j % 3) == 1)
+	  {
+	    #pragma acc loop vector
+	    for (k = 0; k < 32; k++)
+	      {
+		#pragma acc atomic
+		arr[j * 32 + k] += 7;
+	      }
+	  }
+      }
+  }
+
+  for (i = 0; i < 32 * 32; i++)
+    {
+      int j = (i / 32) % 3;
+      assert (arr[i] == i + ((j == 0) ? 3 : (j == 1) ? 7 : 0));
+    }
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-single-6.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-single-6.c
new file mode 100644
index 0000000..b96ae2a
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/worker-single-6.c
@@ -0,0 +1,46 @@ 
+#include <assert.h>
+
+#if defined(ACC_DEVICE_TYPE_host) || defined(ACC_DEVICE_TYPE_host_nonshm)
+#define ACTUAL_GANGS 1
+#else
+#define ACTUAL_GANGS 8
+#endif
+
+/* Test worker-single, vector-partitioned, gang-redundant mode.  */
+
+int
+main (int argc, char *argv[])
+{
+  int n, arr[32], i;
+
+  for (i = 0; i < 32; i++)
+    arr[i] = 0;
+
+  n = 0;
+
+  #pragma acc parallel copy(n, arr) num_gangs(ACTUAL_GANGS) num_workers(8) \
+	  vector_length(32)
+  {
+    int j;
+
+    #pragma acc atomic
+    n++;
+
+    #pragma acc loop vector
+    for (j = 0; j < 32; j++)
+      {
+	#pragma acc atomic
+	arr[j] += 1;
+      }
+
+    #pragma acc atomic
+    n++;
+  }
+
+  assert (n == ACTUAL_GANGS * 2);
+
+  for (i = 0; i < 32; i++)
+    assert (arr[i] == ACTUAL_GANGS);
+
+  return 0;
+}