diff mbox series

[OpenACC] Add target hook TARGET_GOACC_ADJUST_PARALLELISM

Message ID 7dd10c01-ff00-1b9b-77dd-ecd79fce55e3@mentor.com
State New
Headers show
Series [OpenACC] Add target hook TARGET_GOACC_ADJUST_PARALLELISM | expand

Commit Message

Cesar Philippidis Sept. 5, 2018, 8:05 p.m. UTC
At present, GCC fixes the vector length on all targets. However, that is
an artificial restriction. This patch introduces a new
TARGET_GOACC_ADJUST_PARALLELISM hook that enables the runtime to correct
the default number of acc workers and vectors. Extra care needs to be
taken to ensure that large vectors fit inside workers. The target hook
itself doesn't do anything for the host, but the nvptx BE will make use
of it.

Is this patch OK for trunk? I regtested and bootstrapped for x86_64 with
nvptx offloading.

Thanks,
Cesar
diff mbox series

Patch

[openacc] Add target hook TARGET_GOACC_ADJUST_PARALLELISM

	gcc/
	* doc/tm.texi.in: Add placeholder for TARGET_GOACC_ADJUST_PARALLELISM.
	* doc/tm.texi: Regenerate.
	* omp-offload.c (oacc_loop_fixed_partitions): Use the adjust_parallelism
	hook to modify this_mask.
	(oacc_loop_auto_partitions): Use the adjust_parallelism hook to modify
	this_mask and loop->mask.
	(default_goacc_adjust_parallelism): New function.
	* target.def (adjust_parallelism): New hook.
	* targhooks.h (default_goacc_adjust_parallelism): Declare.


diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index a40f45ade07..365a7bbec90 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6029,6 +6029,12 @@  This hook should return the maximum size of a particular dimension,
 or zero if unbounded.
 @end deftypefn
 
+@deftypefn {Target Hook} unsigned TARGET_GOACC_ADJUST_PARALLELISM (unsigned @var{this_mask}, unsigned @var{outer_mask})
+This hook allows the accelerator compiler to remove any unused
+parallelism exposed in the current loop @var{THIS_MASK}, and the
+enclosing loop @var{OUTER_MASK}.  It returns an adjusted mask.
+@end deftypefn
+
 @deftypefn {Target Hook} bool TARGET_GOACC_FORK_JOIN (gcall *@var{call}, const int *@var{dims}, bool @var{is_fork})
 This hook can be used to convert IFN_GOACC_FORK and IFN_GOACC_JOIN
 function calls to target-specific gimple, or indicate whether they
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 39a214e9b2c..9edd2e7ecaf 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4145,6 +4145,8 @@  address;  but often a machine-dependent strategy can generate better code.
 
 @hook TARGET_GOACC_DIM_LIMIT
 
+@hook TARGET_GOACC_ADJUST_PARALLELISM
+
 @hook TARGET_GOACC_FORK_JOIN
 
 @hook TARGET_GOACC_REDUCTION
diff --git a/gcc/omp-offload.c b/gcc/omp-offload.c
index 0abf0283c9e..1659febd2b1 100644
--- a/gcc/omp-offload.c
+++ b/gcc/omp-offload.c
@@ -1218,6 +1218,13 @@  oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask)
 	}
     }
 
+  /* Ideally, we should be coalescing parallelism here if the
+     hardware supports it.  E.g., instead of partitioning a loop
+     across worker and vector axes, sometimes the hardware can
+     execute those loops together without resorting to placing
+     extra thread barriers.  */
+  this_mask = targetm.goacc.adjust_parallelism (this_mask, outer_mask);
+
   mask_all |= this_mask;
 
   if (loop->flags & OLF_TILE)
@@ -1302,6 +1309,7 @@  oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask,
 	  this_mask ^= loop->e_mask;
 	}
 
+      this_mask = targetm.goacc.adjust_parallelism (this_mask, outer_mask);
       loop->mask |= this_mask;
     }
 
@@ -1350,6 +1358,8 @@  oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask,
 	}
 
       loop->mask |= this_mask;
+      loop->mask = targetm.goacc.adjust_parallelism (loop->mask, outer_mask);
+
       if (!loop->mask && noisy)
 	warning_at (loop->loc, 0,
 		    tiling
@@ -1684,6 +1694,15 @@  default_goacc_dim_limit (int ARG_UNUSED (axis))
 #endif
 }
 
+/* Default hook: no adjustment is needed, so return THIS_MASK unchanged.  */
+
+unsigned
+default_goacc_adjust_parallelism (unsigned this_mask,
+				  unsigned ARG_UNUSED (outer_mask))
+{
+  return this_mask;
+}
+
 namespace {
 
 const pass_data pass_data_oacc_device_lower =
diff --git a/gcc/target.def b/gcc/target.def
index c570f3825a5..401d681fc42 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -1678,6 +1678,14 @@  or zero if unbounded.",
 int, (int axis),
 default_goacc_dim_limit)
 
+DEFHOOK
+(adjust_parallelism,
+"This hook allows the accelerator compiler to remove any unused\n\
+parallelism exposed in the current loop @var{THIS_MASK}, and the\n\
+enclosing loop @var{OUTER_MASK}.  It returns an adjusted mask.",
+unsigned, (unsigned this_mask, unsigned outer_mask),
+default_goacc_adjust_parallelism)
+
 DEFHOOK
 (fork_join,
 "This hook can be used to convert IFN_GOACC_FORK and IFN_GOACC_JOIN\n\
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index f92ca5ca997..38e024b13de 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -125,6 +125,7 @@  extern bool default_goacc_validate_dims (tree, int [], int);
 extern int default_goacc_dim_limit (int);
 extern bool default_goacc_fork_join (gcall *, const int [], bool);
 extern void default_goacc_reduction (gcall *);
+extern unsigned default_goacc_adjust_parallelism (unsigned, unsigned);
 
 /* These are here, and not in hooks.[ch], because not all users of
    hooks.h include tm.h, and thus we don't have CUMULATIVE_ARGS.  */
-- 
2.17.1