@@ -87,8 +87,14 @@
2.x. */
#define PTX_CTA_SIZE 1024
+#define PTX_CTA_NUM_BARRIERS 16 /* Presumably the barrier resources available per CTA; confirm against the PTX ISA.  */
#define PTX_WARP_SIZE 32
+#define PTX_PER_CTA_BARRIER 0 /* Presumably the id of the CTA-wide barrier; its uses are not in view here.  */
+#define PTX_NUM_PER_CTA_BARRIERS 1 /* Per-worker barrier ids start right after this many per-CTA ones.  */
+#define PTX_FIRST_PER_WORKER_BARRIER (PTX_NUM_PER_CTA_BARRIERS) /* Evaluates to 1.  */
+#define PTX_NUM_PER_WORKER_BARRIERS (PTX_CTA_NUM_BARRIERS - PTX_NUM_PER_CTA_BARRIERS) /* 16 - 1 = 15; caps the worker dimension in nvptx_apply_dim_limits.  */
+
#define PTX_DEFAULT_VECTOR_LENGTH PTX_WARP_SIZE
#define PTX_MAX_VECTOR_LENGTH PTX_WARP_SIZE
#define PTX_WORKER_LENGTH 32
@@ -5496,6 +5502,13 @@ nvptx_apply_dim_limits (int dims[])
if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0
&& dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] > PTX_CTA_SIZE)
dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
+
+ /* A vector length above the warp size needs a per-worker barrier. */
+ if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0
+ && dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE)
+ /* Don't use more barriers than available. */
+ dims[GOMP_DIM_WORKER] = MIN (dims[GOMP_DIM_WORKER],
+ PTX_NUM_PER_WORKER_BARRIERS);
}
/* Return true if FNDECL contains calls to vector-partitionable routines. */
@@ -1273,6 +1273,10 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
: dims[GOMP_DIM_VECTOR]);
workers = blocks / actual_vectors;
workers = MAX (workers, 1);
+ /* If actual_vectors exceeds 32, we need a per-worker barrier. */
+ if (actual_vectors > 32)
+ /* Don't use more barriers than available. */
+ workers = MIN (workers, 15);
}
for (i = 0; i != GOMP_DIM_MAX; i++)
@@ -1303,6 +1307,24 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
suggest_workers, suggest_workers);
}
+ /* Check if the accelerator has sufficient barrier resources to
+ launch the offloaded kernel. */
+ if (dims[GOMP_DIM_WORKER] > 15 && dims[GOMP_DIM_VECTOR] > 32)
+ {
+ const char *msg
+ = ("The Nvidia accelerator has insufficient barrier resources to launch"
+ " '%s' with num_workers = %d and vector_length = %d"
+ "; "
+ "recompile the program with 'num_workers = x' on that offloaded"
+ " region or '-fopenacc-dim=:x:' where x <= 15"
+ "; "
+ "or, recompile the program with 'vector_length = 32' on that"
+ " offloaded region"
+ ".\n");
+ GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
+ dims[GOMP_DIM_VECTOR]);
+ }
+
/* This reserves a chunk of a pre-allocated page of memory mapped on both
the host and the device. HP is a host pointer to the new chunk, and DP is
the corresponding device pointer. */