nvptx.c (PTX_GANG_DEFAULT): Set to zero.

author Cesar Philippidis <cesar@codesourcery.com>

Wed, 2 Nov 2016 22:10:02 +0000 (15:10 -0700)

committer Cesar Philippidis <cesar@gcc.gnu.org>

Wed, 2 Nov 2016 22:10:02 +0000 (15:10 -0700)
author Cesar Philippidis <cesar@codesourcery.com>
Wed, 2 Nov 2016 22:10:02 +0000 (15:10 -0700)
committer Cesar Philippidis <cesar@gcc.gnu.org>
Wed, 2 Nov 2016 22:10:02 +0000 (15:10 -0700)
diff --git a/gcc/ChangeLog b/gcc/ChangeLog

index 5e98b98028348db1b3fe8ed2c6bf620e5c6f5936..2debfb68d8a5d198376488bcb9285fc1d8e44e8e 100644 (file)
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,8 @@
+2016-11-02  Cesar Philippidis  <cesar@codesourcery.com>
+           Nathan Sidwell  <nathan@acm.org>
+
+       * config/nvptx/nvptx.c (PTX_GANG_DEFAULT): Set to zero.
+
  2016-11-02  Max Filippov  <jcmvbkbc@gmail.com>
  
         * config/xtensa/xtensa.c (xtensa_output_integer_literal_parts):
diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c

index 80fa9ae2f1bd84c397513cc513b9f37953414a71..782bbdecb37d648b717d53450ba8dc8c276c7207 100644 (file)
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -4174,7 +4174,7 @@ nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
  /* Define dimension sizes for known hardware.  */
  #define PTX_VECTOR_LENGTH 32
  #define PTX_WORKER_LENGTH 32
-#define PTX_GANG_DEFAULT  32
+#define PTX_GANG_DEFAULT  0 /* Defer to runtime.  */
  
  /* Validate compute dimensions of an OpenACC offload or routine, fill
     in non-unity defaults.  FN_LEVEL indicates the level at which a
diff --git a/libgomp/ChangeLog b/libgomp/ChangeLog

index 675c63ec1cdc525230d011a01bd1cebb20361413..a7a52f82f0cbd2b15c402638fcf7b75e81ff0a99 100644 (file)
--- a/libgomp/ChangeLog
+++ b/libgomp/ChangeLog
@@ -1,3 +1,11 @@
+2016-11-02  Cesar Philippidis  <cesar@codesourcery.com>
+           Nathan Sidwell  <nathan@acm.org>
+
+       * plugin/plugin-nvptx.c (nvptx_exec): Interrogate board attributes
+       to determine default geometry.
+       * testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c: Set gang
+       dimension.
+
  2016-11-01  Jakub Jelinek  <jakub@redhat.com>
  
         * hashtab.h: Use standard GPLv3 with runtime exception
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c

index 327500c01aa72bfbcbb6d47d11acfeb3451e4d87..5ee350d4c1d1d8cbc287dd39c9ffc28b741b035c 100644 (file)
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -45,6 +45,7 @@
  #include <stdio.h>
  #include <unistd.h>
  #include <assert.h>
+#include <errno.h>
  
  static const char *
  cuda_error (CUresult r)
@@ -932,9 +933,88 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
  
    if (seen_zero)
      {
+      /* See if the user provided the GOMP_OPENACC_DIM environment
+        variable to specify runtime defaults. */
+      static int default_dims[GOMP_DIM_MAX];
+
+      pthread_mutex_lock (&ptx_dev_lock);
+      if (!default_dims[0])
+       {
+         /* We only read the environment variable once.  You can't
+            change it in the middle of execution.  The syntax is
+            the same as for the -fopenacc-dim compilation option.  */
+         const char *env_var = getenv ("GOMP_OPENACC_DIM");
+         if (env_var)
+           {
+             const char *pos = env_var;
+
+             for (i = 0; *pos && i != GOMP_DIM_MAX; i++)
+               {
+                 if (i && *pos++ != ':')
+                   break;
+                 if (*pos != ':')
+                   {
+                     const char *eptr;
+
+                     errno = 0;
+                     long val = strtol (pos, (char **)&eptr, 10);
+                     if (errno || val < 0 || (unsigned)val != val)
+                       break;
+                     default_dims[i] = (int)val;
+                     pos = eptr;
+                   }
+               }
+           }
+
+         int warp_size, block_size, dev_size, cpu_size;
+         CUdevice dev = nvptx_thread()->ptx_dev->dev;
+         /* 32 is the default for known hardware.  */
+         int gang = 0, worker = 32, vector = 32;
+         CUdevice_attribute cu_tpb, cu_ws, cu_mpc, cu_tpm;
+
+         cu_tpb = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK;
+         cu_ws = CU_DEVICE_ATTRIBUTE_WARP_SIZE;
+         cu_mpc = CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT;
+         cu_tpm  = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR;
+
+         if (cuDeviceGetAttribute (&block_size, cu_tpb, dev) == CUDA_SUCCESS
+             && cuDeviceGetAttribute (&warp_size, cu_ws, dev) == CUDA_SUCCESS
+             && cuDeviceGetAttribute (&dev_size, cu_mpc, dev) == CUDA_SUCCESS
+             && cuDeviceGetAttribute (&cpu_size, cu_tpm, dev)  == CUDA_SUCCESS)
+           {
+             GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
+                                " dev_size=%d, cpu_size=%d\n",
+                                warp_size, block_size, dev_size, cpu_size);
+             gang = (cpu_size / block_size) * dev_size;
+             worker = block_size / warp_size;
+             vector = warp_size;
+           }
+
+         /* There is no upper bound on the gang size.  The best size
+            matches the hardware configuration.  Logical gangs are
+            scheduled onto physical hardware.  To maximize usage, we
+            should guess a large number.  */
+         if (default_dims[GOMP_DIM_GANG] < 1)
+           default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
+         /* The worker size must not exceed the hardware.  */
+         if (default_dims[GOMP_DIM_WORKER] < 1
+             || (default_dims[GOMP_DIM_WORKER] > worker && gang))
+           default_dims[GOMP_DIM_WORKER] = worker;
+         /* The vector size must exactly match the hardware.  */
+         if (default_dims[GOMP_DIM_VECTOR] < 1
+             || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
+           default_dims[GOMP_DIM_VECTOR] = vector;
+
+         GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
+                            default_dims[GOMP_DIM_GANG],
+                            default_dims[GOMP_DIM_WORKER],
+                            default_dims[GOMP_DIM_VECTOR]);
+       }
+      pthread_mutex_unlock (&ptx_dev_lock);
+
        for (i = 0; i != GOMP_DIM_MAX; i++)
-       if (!dims[i])
-         dims[i] = /* TODO */ 32;
+       if (!dims[i])
+         dims[i] = default_dims[i];
      }
  
    /* This reserves a chunk of a pre-allocated page of memory mapped on both
@@ -954,8 +1034,8 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
                     mapnum * sizeof (void *));
    GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
                      " gangs=%u, workers=%u, vectors=%u\n",
-                    __FUNCTION__, targ_fn->launch->fn,
-                    dims[0], dims[1], dims[2]);
+                    __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
+                    dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);
  
    // OpenACC           CUDA
    //
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c

index 8a755b88038d46eeb2986a3f39f93b41a8317929..3ca9388d405181c191319ccbd8099710c2151437 100644 (file)
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c
@@ -2,6 +2,8 @@
     not optimized away at -O0, and then confuses the target assembler.
     { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */
  
+/* { dg-additional-options "-fopenacc-dim=32" } */
+
  #include <stdio.h>
  #include <openacc.h>
author	Cesar Philippidis <cesar@codesourcery.com>
	Wed, 2 Nov 2016 22:10:02 +0000 (15:10 -0700)
committer	Cesar Philippidis <cesar@gcc.gnu.org>
	Wed, 2 Nov 2016 22:10:02 +0000 (15:10 -0700)
gcc/ChangeLog		patch | blob | history
gcc/config/nvptx/nvptx.c		patch | blob | history
libgomp/ChangeLog		patch | blob | history
libgomp/plugin/plugin-nvptx.c		patch | blob | history
libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c		patch | blob | history