+2019-01-11 Tom de Vries <tdevries@suse.de>
+
+ * config/nvptx/nvptx.c (PTX_CTA_NUM_BARRIERS, PTX_PER_CTA_BARRIER)
+ (PTX_NUM_PER_CTA_BARRIERS, PTX_FIRST_PER_WORKER_BARRIER)
+ (PTX_NUM_PER_WORKER_BARRIERS): Define.
+ (nvptx_apply_dim_limits): Prevent vector_length 64 and
+ num_workers 16.
+
2019-01-11 Tom de Vries <tdevries@suse.de>
* config/nvptx/nvptx.c (PTX_CTA_SIZE): Move up.
2.x. */
#define PTX_CTA_SIZE 1024
+#define PTX_CTA_NUM_BARRIERS 16
#define PTX_WARP_SIZE 32
+#define PTX_PER_CTA_BARRIER 0
+#define PTX_NUM_PER_CTA_BARRIERS 1
+#define PTX_FIRST_PER_WORKER_BARRIER (PTX_NUM_PER_CTA_BARRIERS)
+#define PTX_NUM_PER_WORKER_BARRIERS (PTX_CTA_NUM_BARRIERS - PTX_NUM_PER_CTA_BARRIERS)
+
#define PTX_DEFAULT_VECTOR_LENGTH PTX_WARP_SIZE
#define PTX_MAX_VECTOR_LENGTH PTX_WARP_SIZE
#define PTX_WORKER_LENGTH 32
if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0
&& dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] > PTX_CTA_SIZE)
dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
+
+ /* If we need a per-worker barrier (vector length above the warp size) ... */
+ if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0
+ && dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE)
+ /* Don't use more barriers than available. */
+ dims[GOMP_DIM_WORKER] = MIN (dims[GOMP_DIM_WORKER],
+ PTX_NUM_PER_WORKER_BARRIERS);
}
/* Return true if FNDECL contains calls to vector-partitionable routines. */
+2019-01-11 Tom de Vries <tdevries@suse.de>
+
+ * plugin/plugin-nvptx.c (nvptx_exec): Prevent vector_length 64 and
+ num_workers 16.
+
2019-01-11 Tom de Vries <tdevries@suse.de>
* testsuite/libgomp.oacc-c-c++-common/reduction-1.c: Remove
: dims[GOMP_DIM_VECTOR]);
workers = blocks / actual_vectors;
workers = MAX (workers, 1);
+ /* If we need a per-worker barrier (vector length above 32) ... */
+ if (actual_vectors > 32)
+ /* Don't use more barriers than available. */
+ workers = MIN (workers, 15);
}
for (i = 0; i != GOMP_DIM_MAX; i++)
suggest_workers, suggest_workers);
}
+ /* Check if the accelerator has sufficient barrier resources to
+ launch the offloaded kernel. */
+ if (dims[GOMP_DIM_WORKER] > 15 && dims[GOMP_DIM_VECTOR] > 32)
+ {
+ const char *msg
+ = ("The Nvidia accelerator has insufficient barrier resources to launch"
+ " '%s' with num_workers = %d and vector_length = %d"
+ "; "
+ "recompile the program with 'num_workers = x' on that offloaded"
+ " region or '-fopenacc-dim=:x:' where x <= 15"
+ "; "
+ "or, recompile the program with 'vector_length = 32' on that"
+ " offloaded region"
+ ".\n");
+ GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
+ dims[GOMP_DIM_VECTOR]);
+ }
+
/* This reserves a chunk of a pre-allocated page of memory mapped on both
the host and the device. HP is a host pointer to the new chunk, and DP is
the corresponding device pointer. */