[nvptx] Don't allow vector_length 64 with num_workers 16

author Tom de Vries <tdevries@suse.de>

Fri, 11 Jan 2019 11:46:43 +0000 (11:46 +0000)

committer Tom de Vries <vries@gcc.gnu.org>

Fri, 11 Jan 2019 11:46:43 +0000 (11:46 +0000)
author Tom de Vries <tdevries@suse.de>
Fri, 11 Jan 2019 11:46:43 +0000 (11:46 +0000)
committer Tom de Vries <vries@gcc.gnu.org>
Fri, 11 Jan 2019 11:46:43 +0000 (11:46 +0000)
diff --git a/gcc/ChangeLog b/gcc/ChangeLog

index 5495b9a6f4a1e2f3fa99a8ca51aa254ffa2ecd9b..5d9c536e41411f63f74e0b55f29d77deefab8b5a 100644 (file)
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,11 @@
+2019-01-11  Tom de Vries  <tdevries@suse.de>
+
+       * config/nvptx/nvptx.c (PTX_CTA_NUM_BARRIERS, PTX_PER_CTA_BARRIER)
+       (PTX_NUM_PER_CTA_BARRIERS, PTX_FIRST_PER_WORKER_BARRIER)
+       (PTX_NUM_PER_WORKER_BARRIERS): Define.
+       (nvptx_apply_dim_limits): Prevent vector_length 64 and
+       num_workers 16.
+
  2019-01-11  Tom de Vries  <tdevries@suse.de>
  
         * config/nvptx/nvptx.c (PTX_CTA_SIZE): Move up.
diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c

index 643f5e86ccc4e47f662853f0de6f6f783c29e9f0..b37010ff58ed586136d2b6d5cde56707def185c3 100644 (file)
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -87,8 +87,14 @@
     2.x.  */
  #define PTX_CTA_SIZE 1024
  
+#define PTX_CTA_NUM_BARRIERS 16
  #define PTX_WARP_SIZE 32
  
+#define PTX_PER_CTA_BARRIER 0
+#define PTX_NUM_PER_CTA_BARRIERS 1
+#define PTX_FIRST_PER_WORKER_BARRIER (PTX_NUM_PER_CTA_BARRIERS)
+#define PTX_NUM_PER_WORKER_BARRIERS (PTX_CTA_NUM_BARRIERS - PTX_NUM_PER_CTA_BARRIERS)
+
  #define PTX_DEFAULT_VECTOR_LENGTH PTX_WARP_SIZE
  #define PTX_MAX_VECTOR_LENGTH PTX_WARP_SIZE
  #define PTX_WORKER_LENGTH 32
@@ -5496,6 +5502,13 @@ nvptx_apply_dim_limits (int dims[])
    if (dims[GOMP_DIM_WORKER] > 0 &&  dims[GOMP_DIM_VECTOR] > 0
        && dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] > PTX_CTA_SIZE)
      dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
+
+  /* If we need a per-worker barrier ... .  */
+  if (dims[GOMP_DIM_WORKER] > 0 &&  dims[GOMP_DIM_VECTOR] > 0
+      && dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE)
+    /* Don't use more barriers than available.  */
+    dims[GOMP_DIM_WORKER] = MIN (dims[GOMP_DIM_WORKER],
+                                PTX_NUM_PER_WORKER_BARRIERS);
  }
  
  /* Return true if FNDECL contains calls to vector-partitionable routines.  */
diff --git a/libgomp/ChangeLog b/libgomp/ChangeLog

index ea339dec9632999586efcc8cd5c2ca0928ca9016..8c04d5a53f42f6e1041c2f010216c70f2671952b 100644 (file)
--- a/libgomp/ChangeLog
+++ b/libgomp/ChangeLog
@@ -1,3 +1,8 @@
+2019-01-11  Tom de Vries  <tdevries@suse.de>
+
+       * plugin/plugin-nvptx.c (nvptx_exec): Prevent vector_length 64 and
+       num_workers 16.
+
  2019-01-11  Tom de Vries  <tdevries@suse.de>
  
         * testsuite/libgomp.oacc-c-c++-common/reduction-1.c: Remove
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c

index 60553bdf3bd58a31eaf72f3f793e63bae22121de..c80da64c42251c72dd3c35e7b2f4d0d7d8699d0b 100644 (file)
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -1273,6 +1273,10 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
                                       : dims[GOMP_DIM_VECTOR]);
                 workers = blocks / actual_vectors;
                 workers = MAX (workers, 1);
+               /* If we need a per-worker barrier ... .  */
+               if (actual_vectors > 32)
+                 /* Don't use more barriers than available.  */
+                 workers = MIN (workers, 15);
               }
  
             for (i = 0; i != GOMP_DIM_MAX; i++)
@@ -1303,6 +1307,24 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
                          suggest_workers, suggest_workers);
      }
  
+  /* Check if the accelerator has sufficient barrier resources to
+     launch the offloaded kernel.  */
+  if (dims[GOMP_DIM_WORKER] > 15 && dims[GOMP_DIM_VECTOR] > 32)
+    {
+      const char *msg
+       = ("The Nvidia accelerator has insufficient barrier resources to launch"
+          " '%s' with num_workers = %d and vector_length = %d"
+          "; "
+          "recompile the program with 'num_workers = x' on that offloaded"
+          " region or '-fopenacc-dim=:x:' where x <= 15"
+          "; "
+          "or, recompile the program with 'vector_length = 32' on that"
+          " offloaded region"
+          ".\n");
+       GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
+                          dims[GOMP_DIM_VECTOR]);
+    }
+
    /* This reserves a chunk of a pre-allocated page of memory mapped on both
       the host and the device. HP is a host pointer to the new chunk, and DP is
       the corresponding device pointer.  */
author	Tom de Vries <tdevries@suse.de>
	Fri, 11 Jan 2019 11:46:43 +0000 (11:46 +0000)
committer	Tom de Vries <vries@gcc.gnu.org>
	Fri, 11 Jan 2019 11:46:43 +0000 (11:46 +0000)
gcc/ChangeLog		patch \| blob \| history
gcc/config/nvptx/nvptx.c		patch \| blob \| history
libgomp/ChangeLog		patch \| blob \| history
libgomp/plugin/plugin-nvptx.c		patch \| blob \| history