diff mbox

[WIP] OpenMP 4 NVPTX support

Message ID 20150421155839.GZ1725@tucnak.redhat.com
State New
Headers show

Commit Message

Jakub Jelinek April 21, 2015, 3:58 p.m. UTC
Hi!

Attached is a minimal patch to get at least a trivial OpenMP 4.0 testcase
offloading to NVPTX (the first patch).  The second patch is WIP, just first
few needed changes to make libgomp to build for NVPTX (several weeks of work
at least).

The following seems to work and the output suggests that it was offloaded to
a non-SHM arch:

int
main ()
{
  int v = 0;
  int *w = 0;
  int x = 0;
#pragma omp target
  {
    v = 6;
    w = &v;
    x = 1; // omp_is_initial_device ();
  }
  __builtin_printf ("%d %p %p %d\n", v, &v, w, x);
  return 0;
}

but already tiny bit more complicated testcase:

extern void *malloc (__SIZE_TYPE__);
extern void free (void *);

int
main ()
{
  int v = 0;
  int *w = 0;
  int x = 0;
#pragma omp target
  {
    v = 6;
    w = &v;
    char *p = malloc (64);
    x = 1; // omp_is_initial_device ();
    free (p);
  }
  __builtin_printf ("%d %p %p %d\n", v, &v, w, x);
  return 0;
}

suggests that while it is nice that when building nvptx accel compiler
we build libgcc.a, libc.a, libm.a, libgfortran.a (and in the future hopefully libgomp.a),
nothing attempts to link those in :(.

Is the plan to link those in at mkoffload time (haven't seen any attempt
of mkoffload to invoke the nvptx-none-ld linker though), or link those in
somehow at link_ptx time in the plugin?
In either case, it isn't clear to me how things will work (if at all) in the
case where multiple shared libraries (or executable and at least one shared
library) have their own offloading bits, and if you try to e.g. call an
offloaded function defined in the shared library from an offloaded kernel in
the executable, because if any library needs some global singleton case, if
it is linked multiple times, no idea what the PTX JIT will do.

Once that is resolved, another thing will be to figure out how to
efficiently implement the TLS libgomp needs for its ICVs and other state
- right now it uses either __thread, or pthread_getspecific, neither of
these is usable of course.  I've been thinking about an array of those
structures in .shared memory indexed by %tid.x, but I guess that runs into
the issue that the array would need to be declared fixed size and there is a
very small size limitation on .shared memory size.
So perhaps a file scope .shared pointer to global memory, where whomever
launches an OpenMP 4.0 kernel (either the libgomp-plugin-nvptx.so.1 doing
GOMP_run, or later on dynamic parallelism from GOMP_target in the nvptx
libgomp.a) allocates the memory and some wrapper sets the .shared variable
to that allocated memory, then calls the kernel?

	Jakub

--- configure.jj	2015-04-21 08:38:24.000000000 +0200
+++ configure	2015-04-21 09:16:42.994959648 +0200
@@ -3171,6 +3171,9 @@ if test x$enable_libgomp = x ; then
 	;;
     *-*-darwin* | *-*-aix*)
 	;;
+    # And on NVPTX as an offloading target.
+    nvptx*-*-*)
+	;;
     *)
 	noconfigdirs="$noconfigdirs target-libgomp"
 	;;
--- libgomp/configure.jj	2015-04-21 11:08:08.347628799 +0200
+++ libgomp/configure	2015-04-21 11:07:39.000000000 +0200
@@ -15038,6 +15038,9 @@ case "$host" in
   *-*-rtems*)
     # RTEMS supports Pthreads, but the library is not available at GCC build time.
     ;;
+  nvptx*-*-*)
+    # NVPTX does not support Pthreads, has its own code replacement.
+    ;;
   *)
     # Check to see if -pthread or -lpthread is needed.  Prefer the former.
     # In case the pthread.h system header is not found, this test will fail.
--- libgomp/configure.tgt.jj	2015-04-21 08:38:00.000000000 +0200
+++ libgomp/configure.tgt	2015-04-21 10:59:30.857197475 +0200
@@ -151,6 +151,10 @@ case "${target}" in
 	XLDFLAGS="${XLDFLAGS} -lpthread"
 	;;
 
+  nvptx*-*-*)
+	config_path="nvptx"
+	;;
+
   *)
 	;;
 
--- libgomp/config.h.in.jj	2015-04-21 08:38:01.000000000 +0200
+++ libgomp/config.h.in	2015-04-21 08:38:01.000000000 +0200
@@ -39,6 +39,9 @@
 /* Define if pthread_{,attr_}{g,s}etaffinity_np is supported. */
 #undef HAVE_PTHREAD_AFFINITY_NP
 
+/* Define to 1 if you have the <pthread.h> header file. */
+#undef HAVE_PTHREAD_H
+
 /* Define to 1 if you have the <semaphore.h> header file. */
 #undef HAVE_SEMAPHORE_H
 
--- libgomp/libgomp.h.jj	2015-04-21 08:38:00.000000000 +0200
+++ libgomp/libgomp.h	2015-04-21 11:15:35.952217394 +0200
@@ -40,7 +40,9 @@
 #include "gstdint.h"
 #include "libgomp-plugin.h"
 
+#ifdef HAVE_PTHREAD_H
 #include <pthread.h>
+#endif
 #include <stdbool.h>
 #include <stdlib.h>
 #include <stdarg.h>
--- libgomp/configure.ac.jj	2015-04-21 08:38:00.000000000 +0200
+++ libgomp/configure.ac	2015-04-21 11:06:38.418117846 +0200
@@ -179,6 +179,9 @@ case "$host" in
   *-*-rtems*)
     # RTEMS supports Pthreads, but the library is not available at GCC build time.
     ;;
+  nvptx*-*-*)
+    # NVPTX does not support Pthreads, has its own code replacement.
+    ;;
   *)
     # Check to see if -pthread or -lpthread is needed.  Prefer the former.
     # In case the pthread.h system header is not found, this test will fail.
--- configure.ac.jj	2015-04-21 08:38:09.000000000 +0200
+++ configure.ac	2015-04-21 09:14:50.107827544 +0200
@@ -539,6 +539,9 @@ if test x$enable_libgomp = x ; then
 	;;
     *-*-darwin* | *-*-aix*)
 	;;
+    # And on NVPTX as an offloading target.
+    nvptx*-*-*)
+	;;
     *)
 	noconfigdirs="$noconfigdirs target-libgomp"
 	;;

Comments

Bernd Schmidt April 22, 2015, 3:08 p.m. UTC | #1
On 04/21/2015 05:58 PM, Jakub Jelinek wrote:

> suggests that while it is nice that when building nvptx accel compiler
> we build libgcc.a, libc.a, libm.a, libgfortran.a (and in the future hopefully libgomp.a),
> nothing attempts to link those in :(.

I have that fixed; I expect I'll get around to posting this at some 
point now that stage1 is open.


Bernd
diff mbox

Patch

--- libgomp/plugin/plugin-nvptx.c.jj	2015-04-21 08:38:00.000000000 +0200
+++ libgomp/plugin/plugin-nvptx.c	2015-04-21 16:55:25.247470080 +0200
@@ -978,8 +978,8 @@  event_add (enum ptx_event_type type, CUe
 
 void
 nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
-	  size_t *sizes, unsigned short *kinds, int num_gangs, int num_workers,
-	  int vector_length, int async, void *targ_mem_desc)
+	    size_t *sizes, unsigned short *kinds, int num_gangs,
+	    int num_workers, int vector_length, int async, void *targ_mem_desc)
 {
   struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
   CUfunction function;
@@ -1137,7 +1137,6 @@  nvptx_host2dev (void *d, const void *h,
   CUresult r;
   CUdeviceptr pb;
   size_t ps;
-  struct nvptx_thread *nvthd = nvptx_thread ();
 
   if (!s)
     return 0;
@@ -1162,7 +1161,8 @@  nvptx_host2dev (void *d, const void *h,
     GOMP_PLUGIN_fatal ("invalid size");
 
 #ifndef DISABLE_ASYNC
-  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
+  struct nvptx_thread *nvthd = nvptx_thread ();
+  if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
     {
       CUevent *e;
 
@@ -1202,7 +1202,6 @@  nvptx_dev2host (void *h, const void *d,
   CUresult r;
   CUdeviceptr pb;
   size_t ps;
-  struct nvptx_thread *nvthd = nvptx_thread ();
 
   if (!s)
     return 0;
@@ -1227,7 +1226,8 @@  nvptx_dev2host (void *h, const void *d,
     GOMP_PLUGIN_fatal ("invalid size");
 
 #ifndef DISABLE_ASYNC
-  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
+  struct nvptx_thread *nvthd = nvptx_thread ();
+  if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
     {
       CUevent *e;
 
@@ -1559,7 +1559,8 @@  GOMP_OFFLOAD_get_name (void)
 unsigned int
 GOMP_OFFLOAD_get_caps (void)
 {
-  return GOMP_OFFLOAD_CAP_OPENACC_200;
+  return GOMP_OFFLOAD_CAP_OPENACC_200
+	 | GOMP_OFFLOAD_CAP_OPENMP_400;
 }
 
 int
@@ -1759,7 +1760,7 @@  GOMP_OFFLOAD_openacc_parallel (void (*fn
 			       void *targ_mem_desc)
 {
   nvptx_exec (fn, mapnum, hostaddrs, devaddrs, sizes, kinds, num_gangs,
-	    num_workers, vector_length, async, targ_mem_desc);
+	      num_workers, vector_length, async, targ_mem_desc);
 }
 
 void
@@ -1889,3 +1890,27 @@  GOMP_OFFLOAD_openacc_set_cuda_stream (in
 {
   return nvptx_set_cuda_stream (async, stream);
 }
+
+void
+GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars)
+{
+  CUfunction function = ((struct targ_fn_descriptor *) tgt_fn)->fn;
+  CUresult r;
+  struct ptx_device *ptx_dev = ptx_devices[ord];
+  const char *maybe_abort_msg = "(perhaps abort was called)";
+  void *args = &tgt_vars;
+
+  r = cuLaunchKernel (function,
+		      1, 1, 1,
+		      1, 1, 1,
+		      0, ptx_dev->null_stream->stream, &args, 0);
+  if (r != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
+
+  r = cuCtxSynchronize ();
+  if (r == CUDA_ERROR_LAUNCH_FAILED)
+    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
+		       maybe_abort_msg);
+  else if (r != CUDA_SUCCESS)
+    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
+}