[nvptx] Use CUDA driver API to select default runtime launch geometry

author Cesar Philippidis <cesar@codesourcery.com>

Mon, 13 Aug 2018 12:04:24 +0000 (05:04 -0700)

committer Tom de Vries <vries@gcc.gnu.org>

Mon, 13 Aug 2018 12:04:24 +0000 (12:04 +0000)
author Cesar Philippidis <cesar@codesourcery.com>
Mon, 13 Aug 2018 12:04:24 +0000 (05:04 -0700)
committer Tom de Vries <vries@gcc.gnu.org>
Mon, 13 Aug 2018 12:04:24 +0000 (12:04 +0000)
diff --git a/libgomp/ChangeLog b/libgomp/ChangeLog

index 4aff6cd9a3325c2092edc0958b2cb8d04e16e5e9..f54a695fb38b456dabe7c268e67454d6eee8e94b 100644 (file)
--- a/libgomp/ChangeLog
+++ b/libgomp/ChangeLog
@@ -1,3 +1,17 @@
+2018-08-13  Cesar Philippidis  <cesar@codesourcery.com>
+           Tom de Vries  <tdevries@suse.de>
+
+       PR target/85590
+       * plugin/cuda/cuda.h (CUoccupancyB2DSize): New typedef.
+       (cuOccupancyMaxPotentialBlockSize): Declare.
+       * plugin/cuda-lib.def (cuOccupancyMaxPotentialBlockSize): New
+       CUDA_ONE_CALL_MAYBE_NULL.
+       * plugin/plugin-nvptx.c (CUDA_VERSION < 6050): Define
+       CUoccupancyB2DSize and declare
+       cuOccupancyMaxPotentialBlockSize.
+       (nvptx_exec): Use cuOccupancyMaxPotentialBlockSize to set the
+       default num_gangs and num_workers when the driver supports it.
+
  2018-08-08  Tom de Vries  <tdevries@suse.de>
  
         * plugin/cuda-lib.def (cuLinkAddData_v2, cuLinkCreate_v2): Declare using
diff --git a/libgomp/plugin/cuda-lib.def b/libgomp/plugin/cuda-lib.def

index 29028b504a052833905f2dae9ff33944cdbcc02b..b2a4c2154ebce153200e4579dc88b00cdd00d94e 100644 (file)
--- a/libgomp/plugin/cuda-lib.def
+++ b/libgomp/plugin/cuda-lib.def
@@ -41,6 +41,7 @@ CUDA_ONE_CALL (cuModuleGetGlobal)
  CUDA_ONE_CALL (cuModuleLoad)
  CUDA_ONE_CALL (cuModuleLoadData)
  CUDA_ONE_CALL (cuModuleUnload)
+CUDA_ONE_CALL_MAYBE_NULL (cuOccupancyMaxPotentialBlockSize)
  CUDA_ONE_CALL (cuStreamCreate)
  CUDA_ONE_CALL (cuStreamDestroy)
  CUDA_ONE_CALL (cuStreamQuery)
diff --git a/libgomp/plugin/cuda/cuda.h b/libgomp/plugin/cuda/cuda.h

index 4799825bda2b773f19e270d2afa9718d2cf46d0c..b4c1b29c5d895bb412974b8eadf7c8a1c995abe3 100644 (file)
--- a/libgomp/plugin/cuda/cuda.h
+++ b/libgomp/plugin/cuda/cuda.h
@@ -44,6 +44,7 @@ typedef void *CUevent;
  typedef void *CUfunction;
  typedef void *CUlinkState;
  typedef void *CUmodule;
+typedef size_t (*CUoccupancyB2DSize)(int);
  typedef void *CUstream;
  
  typedef enum {
@@ -170,6 +171,8 @@ CUresult cuModuleGetGlobal (CUdeviceptr *, size_t *, CUmodule, const char *);
  CUresult cuModuleLoad (CUmodule *, const char *);
  CUresult cuModuleLoadData (CUmodule *, const void *);
  CUresult cuModuleUnload (CUmodule);
+CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
+                                         CUoccupancyB2DSize, size_t, int);
  CUresult cuStreamCreate (CUstream *, unsigned);
  #define cuStreamDestroy cuStreamDestroy_v2
  CUresult cuStreamDestroy (CUstream);
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c

index 6799a264976d5102ebe0897c1ab8b53a4008fb30..bae1b05ccaa9f6c530c1f36898a73eff2aa1a8e3 100644 (file)
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -61,9 +61,12 @@ CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
                         const char *, unsigned, CUjit_option *, void **);
  CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
  #else
+typedef size_t (*CUoccupancyB2DSize)(int);
  CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
                            const char *, unsigned, CUjit_option *, void **);
  CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
+CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
+                                         CUoccupancyB2DSize, size_t, int);
  #endif
  
  #define DO_PRAGMA(x) _Pragma (#x)
@@ -1200,21 +1203,77 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
        {
         bool default_dim_p[GOMP_DIM_MAX];
         for (i = 0; i != GOMP_DIM_MAX; i++)
+         default_dim_p[i] = !dims[i];
+
+       if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize))
           {
-           default_dim_p[i] = !dims[i];
-           if (default_dim_p[i])
-             dims[i] = nvthd->ptx_dev->default_dims[i];
+           for (i = 0; i != GOMP_DIM_MAX; i++)
+             if (default_dim_p[i])
+               dims[i] = nvthd->ptx_dev->default_dims[i];
+
+           if (default_dim_p[GOMP_DIM_VECTOR])
+             dims[GOMP_DIM_VECTOR]
+               = MIN (dims[GOMP_DIM_VECTOR],
+                      (targ_fn->max_threads_per_block / warp_size
+                       * warp_size));
+
+           if (default_dim_p[GOMP_DIM_WORKER])
+             dims[GOMP_DIM_WORKER]
+               = MIN (dims[GOMP_DIM_WORKER],
+                      targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
           }
+       else
+         {
+           /* Handle the case that the compiler allows the runtime to choose
+              the vector-length conservatively, by ignoring
+              gomp_openacc_dims[GOMP_DIM_VECTOR].  TODO: actually handle
+              it.  */
+           int vectors = 0;
+           /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that
+              gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
+              exceed targ_fn->max_threads_per_block.  */
+           int workers = gomp_openacc_dims[GOMP_DIM_WORKER];
+           int gangs = gomp_openacc_dims[GOMP_DIM_GANG];
+           int grids, blocks;
+
+           CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
+                             &blocks, function, NULL, 0,
+                             dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
+           GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
+                              "grid = %d, block = %d\n", grids, blocks);
+
+           /* Keep the num_gangs proportional to the block size.  In
+              the case where a block size is limited by shared-memory
+              or the register file capacity, the runtime will not
+              excessively over-assign gangs to the multiprocessor
+              units if their state is going to be swapped out even
+              more than necessary.  The constant factor 2 is there to
+              prevent threads from idling when there is insufficient
+              work for them.  */
+           if (gangs == 0)
+             gangs = 2 * grids * (blocks / warp_size);
+
+           if (vectors == 0)
+             vectors = warp_size;
+
+           if (workers == 0)
+             {
+               int actual_vectors = (default_dim_p[GOMP_DIM_VECTOR]
+                                     ? vectors
+                                     : dims[GOMP_DIM_VECTOR]);
+               workers = blocks / actual_vectors;
+             }
  
-       if (default_dim_p[GOMP_DIM_VECTOR])
-         dims[GOMP_DIM_VECTOR]
-           = MIN (dims[GOMP_DIM_VECTOR],
-                  (targ_fn->max_threads_per_block / warp_size * warp_size));
-
-       if (default_dim_p[GOMP_DIM_WORKER])
-         dims[GOMP_DIM_WORKER]
-           = MIN (dims[GOMP_DIM_WORKER],
-                  targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
+           for (i = 0; i != GOMP_DIM_MAX; i++)
+             if (default_dim_p[i])
+               switch (i)
+                 {
+                 case GOMP_DIM_GANG: dims[i] = gangs; break;
+                 case GOMP_DIM_WORKER: dims[i] = workers; break;
+                 case GOMP_DIM_VECTOR: dims[i] = vectors; break;
+                 default: GOMP_PLUGIN_fatal ("invalid dim");
+                 }
+         }
        }
      }
author	Cesar Philippidis <cesar@codesourcery.com>
	Mon, 13 Aug 2018 12:04:24 +0000 (05:04 -0700)
committer	Tom de Vries <vries@gcc.gnu.org>
	Mon, 13 Aug 2018 12:04:24 +0000 (12:04 +0000)
libgomp/ChangeLog		patch \| blob \| history
libgomp/plugin/cuda-lib.def		patch \| blob \| history
libgomp/plugin/cuda/cuda.h		patch \| blob \| history
libgomp/plugin/plugin-nvptx.c		patch \| blob \| history