From 052aaaceed25c962791b0b973f66febabaf47574 Mon Sep 17 00:00:00 2001
From: Tom de Vries <tdevries@suse.de>
Date: Fri, 11 Jan 2019 11:46:43 +0000
Subject: [PATCH] [nvptx] Don't allow vector_length 64 with num_workers 16

When using a compiler build with:
...
+#define PTX_DEFAULT_VECTOR_LENGTH PTX_CTA_SIZE
...

consider a test-case:
...
int
main (void)
{
  #pragma acc parallel vector_length (64)
    #pragma acc loop worker
    for (unsigned int i = 0; i < 32; i++)
      #pragma acc loop vector
      for (unsigned int j = 0; j < 64; j++)
        ;

  return 0;
}
...

If num_workers is 16, either because:
- we add a "num_workers (16)" clause on the parallel directive, or
- we set "GOMP_OPENACC_DIM=:16:", or
- the libgomp plugin chooses 16 num_workers
we run into an illegal instruction at runtime, because a bar.sync instruction
tries to use a barrier 16.  The instruction is illegal, because ptx supports
only 16 barriers per CTA, and the valid range is 0..15.

The problem is that with a warp-multiple vector length, we use a code generation
scheme with a per-worker barrier.  And because barrier zero is reserved for
per-cta barrier, only the remaining 15 barriers can be used as per-worker
barrier, and consequently we can't use num_workers larger than 15.

This problem occurs only for vector_length 64.  For vector_length 32, we use a
different code generation scheme, and for vector_length >= 96, the maximum
num_workers is not big enough not to trigger this problem.

Also, this problem only occurs for num_workers 16.  As explained above,
num_workers 15 is safe to use, and 16 is already the maximum num_workers for
vector_length 64.

This patch fixes the problem in both the compiler (handling "num_workers (16)")
and in the libgomp nvptx plugin (with and without "GOMP_OPENACC_DIM=:16:").

2019-01-11  Tom de Vries  <tdevries@suse.de>

	* config/nvptx/nvptx.c (PTX_CTA_NUM_BARRIERS, PTX_PER_CTA_BARRIER)
	(PTX_NUM_PER_CTA_BARRIER, PTX_FIRST_PER_WORKER_BARRIER)
	(PTX_NUM_PER_WORKER_BARRIERS): Define.
	(nvptx_apply_dim_limits): Prevent vector_length 64 and
	num_workers 16.

	* plugin/plugin-nvptx.c (nvptx_exec): Prevent vector_length 64 and
	num_workers 16.

From-SVN: r267838
---
 gcc/ChangeLog                 |  8 ++++++++
 gcc/config/nvptx/nvptx.c      | 13 +++++++++++++
 libgomp/ChangeLog             |  5 +++++
 libgomp/plugin/plugin-nvptx.c | 22 ++++++++++++++++++++++
 4 files changed, 48 insertions(+)

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 5495b9a6f4a..5d9c536e414 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,11 @@
+2019-01-11  Tom de Vries  <tdevries@suse.de>
+
+	* config/nvptx/nvptx.c (PTX_CTA_NUM_BARRIERS, PTX_PER_CTA_BARRIER)
+	(PTX_NUM_PER_CTA_BARRIER, PTX_FIRST_PER_WORKER_BARRIER)
+	(PTX_NUM_PER_WORKER_BARRIERS): Define.
+	(nvptx_apply_dim_limits): Prevent vector_length 64 and
+	num_workers 16.
+
 2019-01-11  Tom de Vries  <tdevries@suse.de>
 
 	* config/nvptx/nvptx.c (PTX_CTA_SIZE): Move up.
diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 643f5e86ccc..b37010ff58e 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -87,8 +87,14 @@
    2.x.  */
 #define PTX_CTA_SIZE 1024
 
+#define PTX_CTA_NUM_BARRIERS 16
 #define PTX_WARP_SIZE 32
 
+#define PTX_PER_CTA_BARRIER 0
+#define PTX_NUM_PER_CTA_BARRIERS 1
+#define PTX_FIRST_PER_WORKER_BARRIER (PTX_NUM_PER_CTA_BARRIERS)
+#define PTX_NUM_PER_WORKER_BARRIERS (PTX_CTA_NUM_BARRIERS - PTX_NUM_PER_CTA_BARRIERS)
+
 #define PTX_DEFAULT_VECTOR_LENGTH PTX_WARP_SIZE
 #define PTX_MAX_VECTOR_LENGTH PTX_WARP_SIZE
 #define PTX_WORKER_LENGTH 32
@@ -5496,6 +5502,13 @@ nvptx_apply_dim_limits (int dims[])
   if (dims[GOMP_DIM_WORKER] > 0 &&  dims[GOMP_DIM_VECTOR] > 0
       && dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] > PTX_CTA_SIZE)
     dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
+
+  /* If we need a per-worker barrier ... .  */
+  if (dims[GOMP_DIM_WORKER] > 0 &&  dims[GOMP_DIM_VECTOR] > 0
+      && dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE)
+    /* Don't use more barriers than available.  */
+    dims[GOMP_DIM_WORKER] = MIN (dims[GOMP_DIM_WORKER],
+				 PTX_NUM_PER_WORKER_BARRIERS);
 }
 
 /* Return true if FNDECL contains calls to vector-partitionable routines.  */
diff --git a/libgomp/ChangeLog b/libgomp/ChangeLog
index ea339dec963..8c04d5a53f4 100644
--- a/libgomp/ChangeLog
+++ b/libgomp/ChangeLog
@@ -1,3 +1,8 @@
+2019-01-11  Tom de Vries  <tdevries@suse.de>
+
+	* plugin/plugin-nvptx.c (nvptx_exec): Prevent vector_length 64 and
+	num_workers 16.
+
 2019-01-11  Tom de Vries  <tdevries@suse.de>
 
 	* testsuite/libgomp.oacc-c-c++-common/reduction-1.c: Remove
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 60553bdf3bd..c80da64c422 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -1273,6 +1273,10 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
 				      : dims[GOMP_DIM_VECTOR]);
 		workers = blocks / actual_vectors;
 		workers = MAX (workers, 1);
+		/* If we need a per-worker barrier ... .  */
+		if (actual_vectors > 32)
+		  /* Don't use more barriers than available.  */
+		  workers = MIN (workers, 15);
 	      }
 
 	    for (i = 0; i != GOMP_DIM_MAX; i++)
@@ -1303,6 +1307,24 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
 			 suggest_workers, suggest_workers);
     }
 
+  /* Check if the accelerator has sufficient barrier resources to
+     launch the offloaded kernel.  */
+  if (dims[GOMP_DIM_WORKER] > 15 && dims[GOMP_DIM_VECTOR] > 32)
+    {
+      const char *msg
+	= ("The Nvidia accelerator has insufficient barrier resources to launch"
+	   " '%s' with num_workers = %d and vector_length = %d"
+	   "; "
+	   "recompile the program with 'num_workers = x' on that offloaded"
+	   " region or '-fopenacc-dim=:x:' where x <= 15"
+	   "; "
+	   "or, recompile the program with 'vector_length = 32' on that"
+	   " offloaded region"
+	   ".\n");
+	GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
+			   dims[GOMP_DIM_VECTOR]);
+    }
+
   /* This reserves a chunk of a pre-allocated page of memory mapped on both
      the host and the device. HP is a host pointer to the new chunk, and DP is
      the corresponding device pointer.  */
-- 
2.30.2