[gcc.git] / libgomp / plugin / plugin-nvptx.c
1 /* Plugin for NVPTX execution.
2
3 Copyright (C) 2013-2021 Free Software Foundation, Inc.
4
5 Contributed by Mentor Embedded.
6
7 This file is part of the GNU Offloading and Multi Processing Library
8 (libgomp).
9
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
13 any later version.
14
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
18 more details.
19
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
23
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
28
29 /* Nvidia PTX-specific parts of OpenACC support. The cuda driver
30 library appears to hold some implicit state, but the documentation
31 is not clear as to what that state might be, or how one might
32 propagate it from one thread to another. */
33
34 #define _GNU_SOURCE
35 #include "openacc.h"
36 #include "config.h"
37 #include "libgomp-plugin.h"
38 #include "oacc-plugin.h"
39 #include "gomp-constants.h"
40 #include "oacc-int.h"
41
42 #include <pthread.h>
43 #include <cuda.h>
44 #include <stdbool.h>
45 #include <limits.h>
46 #include <string.h>
47 #include <stdio.h>
48 #include <unistd.h>
49 #include <assert.h>
50 #include <errno.h>
51
52 /* An arbitrary fixed limit (128MB) for the size of the OpenMP soft stacks
53 block to cache between kernel invocations. For soft-stacks blocks bigger
54 than this, we will free the block before attempting another GPU memory
55 allocation (i.e. in GOMP_OFFLOAD_alloc). Otherwise, if an allocation fails,
56 we will free the cached soft-stacks block anyway, then retry the
57 allocation. If that fails too, we lose. */
58
59 #define SOFTSTACK_CACHE_LIMIT 134217728
60
61 #if CUDA_VERSION < 6000
62 extern CUresult cuGetErrorString (CUresult, const char **);
63 #define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
64 #endif
65
66 #if CUDA_VERSION >= 6050
67 #undef cuLinkCreate
68 #undef cuLinkAddData
69 CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
70 const char *, unsigned, CUjit_option *, void **);
71 CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
72 #else
73 typedef size_t (*CUoccupancyB2DSize)(int);
74 CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
75 const char *, unsigned, CUjit_option *, void **);
76 CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
77 CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
78 CUoccupancyB2DSize, size_t, int);
79 #endif
80
81 #define DO_PRAGMA(x) _Pragma (#x)
82
83 #if PLUGIN_NVPTX_DYNAMIC
84 # include <dlfcn.h>
85
86 struct cuda_lib_s {
87
88 # define CUDA_ONE_CALL(call) \
89 __typeof (call) *call;
90 # define CUDA_ONE_CALL_MAYBE_NULL(call) \
91 CUDA_ONE_CALL (call)
92 #include "cuda-lib.def"
93 # undef CUDA_ONE_CALL
94 # undef CUDA_ONE_CALL_MAYBE_NULL
95
96 } cuda_lib;
97
98 /* -1 if init_cuda_lib has not been called yet, false
99 if it has been and failed, true if it has been and succeeded. */
100 static signed char cuda_lib_inited = -1;
101
102 /* Dynamically load the CUDA driver library (libcuda) and initialize
103 function pointers; return false if unsuccessful, true if successful. */
104 static bool
105 init_cuda_lib (void)
106 {
107 if (cuda_lib_inited != -1)
108 return cuda_lib_inited;
109 const char *cuda_runtime_lib = "libcuda.so.1";
110 void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
111 cuda_lib_inited = false;
112 if (h == NULL)
113 return false;
114
115 # define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
116 # define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
117 # define CUDA_ONE_CALL_1(call, allow_null) \
118 cuda_lib.call = dlsym (h, #call); \
119 if (!allow_null && cuda_lib.call == NULL) \
120 return false;
121 #include "cuda-lib.def"
122 # undef CUDA_ONE_CALL
123 # undef CUDA_ONE_CALL_1
124 # undef CUDA_ONE_CALL_MAYBE_NULL
125
126 cuda_lib_inited = true;
127 return true;
128 }
129 # define CUDA_CALL_PREFIX cuda_lib.
130 #else
131
132 # define CUDA_ONE_CALL(call)
133 # define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
134 #include "cuda-lib.def"
135 #undef CUDA_ONE_CALL_MAYBE_NULL
136 #undef CUDA_ONE_CALL
137
138 # define CUDA_CALL_PREFIX
139 # define init_cuda_lib() true
140 #endif
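/* Illustrative sketch, not part of the build: for an entry point such as
   cuMemAlloc, "cuda-lib.def" is expanded in two different ways.  With
   PLUGIN_NVPTX_DYNAMIC it contributes a member '__typeof (cuMemAlloc)
   *cuMemAlloc;' to 'struct cuda_lib_s', which init_cuda_lib fills in via
   dlsym, and every call is routed through CUDA_CALL_PREFIX as
   'cuda_lib.cuMemAlloc (...)'.  Without it, the plugin links against libcuda
   directly: entries marked CUDA_ONE_CALL_MAYBE_NULL merely get a
   '#pragma weak' declaration, CUDA_CALL_PREFIX is empty, and the driver
   functions are called by their plain names.  */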
141
142 #include "secure_getenv.h"
143
144 #undef MIN
145 #undef MAX
146 #define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
147 #define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
148
149 /* Convenience macros for the frequently used CUDA library call and
150 error handling sequence, as well as for CUDA library calls that
151 do the error checking themselves or don't do it at all. */
152
153 #define CUDA_CALL_ERET(ERET, FN, ...) \
154 do { \
155 unsigned __r \
156 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
157 if (__r != CUDA_SUCCESS) \
158 { \
159 GOMP_PLUGIN_error (#FN " error: %s", \
160 cuda_error (__r)); \
161 return ERET; \
162 } \
163 } while (0)
164
165 #define CUDA_CALL(FN, ...) \
166 CUDA_CALL_ERET (false, FN, __VA_ARGS__)
167
168 #define CUDA_CALL_ASSERT(FN, ...) \
169 do { \
170 unsigned __r \
171 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
172 if (__r != CUDA_SUCCESS) \
173 { \
174 GOMP_PLUGIN_fatal (#FN " error: %s", \
175 cuda_error (__r)); \
176 } \
177 } while (0)
178
179 #define CUDA_CALL_NOCHECK(FN, ...) \
180 CUDA_CALL_PREFIX FN (__VA_ARGS__)
181
182 #define CUDA_CALL_EXISTS(FN) \
183 CUDA_CALL_PREFIX FN
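/* Usage sketch for the macros above (hypothetical call site): a plugin entry
   point returning bool can write 'CUDA_CALL (cuMemAlloc, &d, s);', which on
   failure reports "cuMemAlloc error: ..." through GOMP_PLUGIN_error and
   returns false.  CUDA_CALL_ERET does the same with a caller-chosen error
   return value, CUDA_CALL_ASSERT calls GOMP_PLUGIN_fatal instead of
   returning, CUDA_CALL_NOCHECK hands the raw CUresult back to the caller,
   and CUDA_CALL_EXISTS tests whether an entry point declared
   CUDA_ONE_CALL_MAYBE_NULL is actually available at run time.  */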
184
185 static const char *
186 cuda_error (CUresult r)
187 {
188 const char *fallback = "unknown cuda error";
189 const char *desc;
190
191 if (!CUDA_CALL_EXISTS (cuGetErrorString))
192 return fallback;
193
194 r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
195 if (r == CUDA_SUCCESS)
196 return desc;
197
198 return fallback;
199 }
200
201 /* Version of the CUDA Toolkit in the same MAJOR.MINOR format that is used by
202 Nvidia, such as in the 'deviceQuery' program (Nvidia's CUDA samples). */
203 static char cuda_driver_version_s[30];
204
205 static unsigned int instantiated_devices = 0;
206 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
207
208 /* NVPTX/CUDA specific definition of asynchronous queues. */
209 struct goacc_asyncqueue
210 {
211 CUstream cuda_stream;
212 };
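/* Each OpenACC async queue wraps exactly one CUDA stream; the queue
   operations further below map directly onto the stream API (cuStreamQuery
   for async_test, cuStreamSynchronize for async_synchronize, and a
   cuEventRecord/cuStreamWaitEvent pair for async_serialize).  */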
213
214 struct nvptx_callback
215 {
216 void (*fn) (void *);
217 void *ptr;
218 struct goacc_asyncqueue *aq;
219 struct nvptx_callback *next;
220 };
221
222 /* Thread-specific data for PTX. */
223
224 struct nvptx_thread
225 {
226 /* We currently have this embedded inside the plugin because libgomp manages
227 devices through integer target_ids. This might be better if using an
228 opaque target-specific pointer directly from gomp_device_descr. */
229 struct ptx_device *ptx_dev;
230 };
231
232 /* Target data function launch information. */
233
234 struct targ_fn_launch
235 {
236 const char *fn;
237 unsigned short dim[GOMP_DIM_MAX];
238 };
239
240 /* Target PTX object information. */
241
242 struct targ_ptx_obj
243 {
244 const char *code;
245 size_t size;
246 };
247
248 /* Target data image information. */
249
250 typedef struct nvptx_tdata
251 {
252 const struct targ_ptx_obj *ptx_objs;
253 unsigned ptx_num;
254
255 const char *const *var_names;
256 unsigned var_num;
257
258 const struct targ_fn_launch *fn_descs;
259 unsigned fn_num;
260 } nvptx_tdata_t;
261
262 /* Descriptor of a loaded function. */
263
264 struct targ_fn_descriptor
265 {
266 CUfunction fn;
267 const struct targ_fn_launch *launch;
268 int regs_per_thread;
269 int max_threads_per_block;
270 };
271
272 /* A loaded PTX image. */
273 struct ptx_image_data
274 {
275 const void *target_data;
276 CUmodule module;
277
278 struct targ_fn_descriptor *fns; /* Array of functions. */
279
280 struct ptx_image_data *next;
281 };
282
283 struct ptx_free_block
284 {
285 void *ptr;
286 struct ptx_free_block *next;
287 };
288
289 struct ptx_device
290 {
291 CUcontext ctx;
292 bool ctx_shared;
293 CUdevice dev;
294
295 int ord;
296 bool overlap;
297 bool map;
298 bool concur;
299 bool mkern;
300 int mode;
301 int clock_khz;
302 int num_sms;
303 int regs_per_block;
304 int regs_per_sm;
305 int warp_size;
306 int max_threads_per_block;
307 int max_threads_per_multiprocessor;
308 int default_dims[GOMP_DIM_MAX];
309
310 /* Length as used by the CUDA Runtime API ('struct cudaDeviceProp'). */
311 char name[256];
312
313 struct ptx_image_data *images; /* Images loaded on device. */
314 pthread_mutex_t image_lock; /* Lock for above list. */
315
316 struct ptx_free_block *free_blocks;
317 pthread_mutex_t free_blocks_lock;
318
319 /* OpenMP stacks, cached between kernel invocations. */
320 struct
321 {
322 CUdeviceptr ptr;
323 size_t size;
324 pthread_mutex_t lock;
325 } omp_stacks;
326
327 struct ptx_device *next;
328 };
329
330 static struct ptx_device **ptx_devices;
331
332 static inline struct nvptx_thread *
333 nvptx_thread (void)
334 {
335 return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
336 }
337
338 /* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK
339 should be locked on entry and remains locked on exit. */
340
341 static bool
342 nvptx_init (void)
343 {
344 int ndevs;
345
346 if (instantiated_devices != 0)
347 return true;
348
349 if (!init_cuda_lib ())
350 return false;
351
352 CUDA_CALL (cuInit, 0);
353
354 int cuda_driver_version;
355 CUDA_CALL_ERET (NULL, cuDriverGetVersion, &cuda_driver_version);
356 snprintf (cuda_driver_version_s, sizeof cuda_driver_version_s,
357 "CUDA Driver %u.%u",
358 cuda_driver_version / 1000, cuda_driver_version % 1000 / 10);
359
360 CUDA_CALL (cuDeviceGetCount, &ndevs);
361 ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
362 * ndevs);
363
364 return true;
365 }
366
367 /* Select the N'th PTX device for the current host thread. The device must
368 have been opened before calling this function. */
369
370 static bool
371 nvptx_attach_host_thread_to_device (int n)
372 {
373 CUdevice dev;
374 CUresult r;
375 struct ptx_device *ptx_dev;
376 CUcontext thd_ctx;
377
378 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
379 if (r == CUDA_ERROR_NOT_PERMITTED)
380 {
381 /* Assume we're in a CUDA callback, just return true. */
382 return true;
383 }
384 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
385 {
386 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
387 return false;
388 }
389
390 if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
391 return true;
392 else
393 {
394 CUcontext old_ctx;
395
396 ptx_dev = ptx_devices[n];
397 if (!ptx_dev)
398 {
399 GOMP_PLUGIN_error ("device %d not found", n);
400 return false;
401 }
402
403 CUDA_CALL (cuCtxGetCurrent, &thd_ctx);
404
405 /* We don't necessarily have a current context (e.g. if it has been
406 destroyed). Pop it if we do though. */
407 if (thd_ctx != NULL)
408 CUDA_CALL (cuCtxPopCurrent, &old_ctx);
409
410 CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
411 }
412 return true;
413 }
414
415 static struct ptx_device *
416 nvptx_open_device (int n)
417 {
418 struct ptx_device *ptx_dev;
419 CUdevice dev, ctx_dev;
420 CUresult r;
421 int async_engines, pi;
422
423 CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);
424
425 ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));
426
427 ptx_dev->ord = n;
428 ptx_dev->dev = dev;
429 ptx_dev->ctx_shared = false;
430
431 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
432 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
433 {
434 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
435 return NULL;
436 }
437
438 if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
439 {
440 /* The current host thread has an active context for a different device.
441 Detach it. */
442 CUcontext old_ctx;
443 CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
444 }
445
446 CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);
447
448 if (!ptx_dev->ctx)
449 CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
450 else
451 ptx_dev->ctx_shared = true;
452
453 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
454 &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
455 ptx_dev->overlap = pi;
456
457 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
458 &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
459 ptx_dev->map = pi;
460
461 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
462 &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
463 ptx_dev->concur = pi;
464
465 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
466 &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
467 ptx_dev->mode = pi;
468
469 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
470 &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
471 ptx_dev->mkern = pi;
472
473 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
474 &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
475 ptx_dev->clock_khz = pi;
476
477 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
478 &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
479 ptx_dev->num_sms = pi;
480
481 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
482 &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
483 ptx_dev->regs_per_block = pi;
484
485 /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
486 in CUDA 6.0 and newer. */
487 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
488 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
489 dev);
490 /* Fallback: use limit of registers per block, which is usually equal. */
491 if (r == CUDA_ERROR_INVALID_VALUE)
492 pi = ptx_dev->regs_per_block;
493 else if (r != CUDA_SUCCESS)
494 {
495 GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
496 return NULL;
497 }
498 ptx_dev->regs_per_sm = pi;
499
500 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
501 &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
502 if (pi != 32)
503 {
504 GOMP_PLUGIN_error ("Only warp size 32 is supported");
505 return NULL;
506 }
507 ptx_dev->warp_size = pi;
508
509 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
510 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
511 ptx_dev->max_threads_per_block = pi;
512
513 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
514 CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
515 ptx_dev->max_threads_per_multiprocessor = pi;
516
517 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
518 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
519 if (r != CUDA_SUCCESS)
520 async_engines = 1;
521
522 for (int i = 0; i != GOMP_DIM_MAX; i++)
523 ptx_dev->default_dims[i] = 0;
524
525 CUDA_CALL_ERET (NULL, cuDeviceGetName, ptx_dev->name, sizeof ptx_dev->name,
526 dev);
527
528 ptx_dev->images = NULL;
529 pthread_mutex_init (&ptx_dev->image_lock, NULL);
530
531 ptx_dev->free_blocks = NULL;
532 pthread_mutex_init (&ptx_dev->free_blocks_lock, NULL);
533
534 ptx_dev->omp_stacks.ptr = 0;
535 ptx_dev->omp_stacks.size = 0;
536 pthread_mutex_init (&ptx_dev->omp_stacks.lock, NULL);
537
538 return ptx_dev;
539 }
540
541 static bool
542 nvptx_close_device (struct ptx_device *ptx_dev)
543 {
544 if (!ptx_dev)
545 return true;
546
547 for (struct ptx_free_block *b = ptx_dev->free_blocks; b;)
548 {
549 struct ptx_free_block *b_next = b->next;
550 CUDA_CALL (cuMemFree, (CUdeviceptr) b->ptr);
551 free (b);
552 b = b_next;
553 }
554
555 pthread_mutex_destroy (&ptx_dev->free_blocks_lock);
556 pthread_mutex_destroy (&ptx_dev->image_lock);
557
558 pthread_mutex_destroy (&ptx_dev->omp_stacks.lock);
559
560 if (ptx_dev->omp_stacks.ptr)
561 CUDA_CALL (cuMemFree, ptx_dev->omp_stacks.ptr);
562
563 if (!ptx_dev->ctx_shared)
564 CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
565
566 free (ptx_dev);
567 return true;
568 }
569
570 static int
571 nvptx_get_num_devices (void)
572 {
573 int n;
574
575 /* This function will be called before the plugin has been initialized in
576 order to enumerate available devices, but CUDA API routines can't be used
577 until cuInit has been called. Just call it now (but don't yet do any
578 further initialization). */
579 if (instantiated_devices == 0)
580 {
581 if (!init_cuda_lib ())
582 return 0;
583 CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
584 /* This is not an error: e.g. we may have CUDA libraries installed but
585 no devices available. */
586 if (r != CUDA_SUCCESS)
587 {
588 GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
589 cuda_error (r));
590 return 0;
591 }
592 }
593
594 CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
595 return n;
596 }
597
598 static void
599 notify_var (const char *var_name, const char *env_var)
600 {
601 if (env_var == NULL)
602 GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name);
603 else
604 GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var);
605 }
606
607 static void
608 process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o)
609 {
610 const char *var_name = "GOMP_NVPTX_JIT";
611 const char *env_var = secure_getenv (var_name);
612 notify_var (var_name, env_var);
613
614 if (env_var == NULL)
615 return;
616
617 const char *c = env_var;
618 while (*c != '\0')
619 {
620 while (*c == ' ')
621 c++;
622
623 if (c[0] == '-' && c[1] == 'O'
624 && '0' <= c[2] && c[2] <= '4'
625 && (c[3] == '\0' || c[3] == ' '))
626 {
627 *gomp_nvptx_o = c[2] - '0';
628 c += 3;
629 continue;
630 }
631
632 GOMP_PLUGIN_error ("Error parsing %s", var_name);
633 break;
634 }
635 }
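/* Example (assuming a shell environment): running the offloading program
   with 'GOMP_NVPTX_JIT=-O0' in the environment makes link_ptx below pass
   CU_JIT_OPTIMIZATION_LEVEL 0 to the PTX JIT; levels -O0 through -O4 are
   accepted, and anything else is diagnosed as a parse error.  */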
636
637 static bool
638 link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
639 unsigned num_objs)
640 {
641 CUjit_option opts[7];
642 void *optvals[7];
643 float elapsed = 0.0;
644 char elog[1024];
645 char ilog[16384];
646 CUlinkState linkstate;
647 CUresult r;
648 void *linkout;
649 size_t linkoutsize __attribute__ ((unused));
650
651 opts[0] = CU_JIT_WALL_TIME;
652 optvals[0] = &elapsed;
653
654 opts[1] = CU_JIT_INFO_LOG_BUFFER;
655 optvals[1] = &ilog[0];
656
657 opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
658 optvals[2] = (void *) sizeof ilog;
659
660 opts[3] = CU_JIT_ERROR_LOG_BUFFER;
661 optvals[3] = &elog[0];
662
663 opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
664 optvals[4] = (void *) sizeof elog;
665
666 opts[5] = CU_JIT_LOG_VERBOSE;
667 optvals[5] = (void *) 1;
668
669 static intptr_t gomp_nvptx_o = -1;
670
671 static bool init_done = false;
672 if (!init_done)
673 {
674 process_GOMP_NVPTX_JIT (&gomp_nvptx_o);
675 init_done = true;
676 }
677
678 int nopts = 6;
679 if (gomp_nvptx_o != -1)
680 {
681 opts[nopts] = CU_JIT_OPTIMIZATION_LEVEL;
682 optvals[nopts] = (void *) gomp_nvptx_o;
683 nopts++;
684 }
685
686 if (CUDA_CALL_EXISTS (cuLinkCreate_v2))
687 CUDA_CALL (cuLinkCreate_v2, nopts, opts, optvals, &linkstate);
688 else
689 CUDA_CALL (cuLinkCreate, nopts, opts, optvals, &linkstate);
690
691 for (; num_objs--; ptx_objs++)
692 {
693 /* cuLinkAddData's 'data' argument erroneously omits the const
694 qualifier. */
695 GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
696 if (CUDA_CALL_EXISTS (cuLinkAddData_v2))
697 r = CUDA_CALL_NOCHECK (cuLinkAddData_v2, linkstate, CU_JIT_INPUT_PTX,
698 (char *) ptx_objs->code, ptx_objs->size,
699 0, 0, 0, 0);
700 else
701 r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
702 (char *) ptx_objs->code, ptx_objs->size,
703 0, 0, 0, 0);
704 if (r != CUDA_SUCCESS)
705 {
706 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
707 GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
708 cuda_error (r));
709 return false;
710 }
711 }
712
713 GOMP_PLUGIN_debug (0, "Linking\n");
714 r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);
715
716 GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
717 GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
718
719 if (r != CUDA_SUCCESS)
720 {
721 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
722 GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
723 return false;
724 }
725
726 CUDA_CALL (cuModuleLoadData, module, linkout);
727 CUDA_CALL (cuLinkDestroy, linkstate);
728 return true;
729 }
730
731 static void
732 nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
733 unsigned *dims, void *targ_mem_desc,
734 CUdeviceptr dp, CUstream stream)
735 {
736 struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
737 CUfunction function;
738 int i;
739 void *kargs[1];
740 struct nvptx_thread *nvthd = nvptx_thread ();
741 int warp_size = nvthd->ptx_dev->warp_size;
742
743 function = targ_fn->fn;
744
745 /* Initialize the launch dimensions. Typically this is constant,
746 provided by the device compiler, but we must permit runtime
747 values. */
748 int seen_zero = 0;
749 for (i = 0; i != GOMP_DIM_MAX; i++)
750 {
751 if (targ_fn->launch->dim[i])
752 dims[i] = targ_fn->launch->dim[i];
753 if (!dims[i])
754 seen_zero = 1;
755 }
756
757 if (seen_zero)
758 {
759 pthread_mutex_lock (&ptx_dev_lock);
760
761 static int gomp_openacc_dims[GOMP_DIM_MAX];
762 if (!gomp_openacc_dims[0])
763 {
764 /* See if the user provided GOMP_OPENACC_DIM environment
765 variable to specify runtime defaults. */
766 for (int i = 0; i < GOMP_DIM_MAX; ++i)
767 gomp_openacc_dims[i] = GOMP_PLUGIN_acc_default_dim (i);
768 }
769
770 if (!nvthd->ptx_dev->default_dims[0])
771 {
772 int default_dims[GOMP_DIM_MAX];
773 for (int i = 0; i < GOMP_DIM_MAX; ++i)
774 default_dims[i] = gomp_openacc_dims[i];
775
776 int gang, worker, vector;
777 {
778 int block_size = nvthd->ptx_dev->max_threads_per_block;
779 int cpu_size = nvthd->ptx_dev->max_threads_per_multiprocessor;
780 int dev_size = nvthd->ptx_dev->num_sms;
781 GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
782 " dev_size=%d, cpu_size=%d\n",
783 warp_size, block_size, dev_size, cpu_size);
784
785 gang = (cpu_size / block_size) * dev_size;
786 worker = block_size / warp_size;
787 vector = warp_size;
788 }
789
790 /* There is no upper bound on the gang size. The best size
791 matches the hardware configuration. Logical gangs are
792 scheduled onto physical hardware. To maximize usage, we
793 should guess a large number. */
794 if (default_dims[GOMP_DIM_GANG] < 1)
795 default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
796 /* The worker size must not exceed the hardware. */
797 if (default_dims[GOMP_DIM_WORKER] < 1
798 || (default_dims[GOMP_DIM_WORKER] > worker && gang))
799 default_dims[GOMP_DIM_WORKER] = worker;
800 /* The vector size must exactly match the hardware. */
801 if (default_dims[GOMP_DIM_VECTOR] < 1
802 || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
803 default_dims[GOMP_DIM_VECTOR] = vector;
804
805 GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
806 default_dims[GOMP_DIM_GANG],
807 default_dims[GOMP_DIM_WORKER],
808 default_dims[GOMP_DIM_VECTOR]);
809
810 for (i = 0; i != GOMP_DIM_MAX; i++)
811 nvthd->ptx_dev->default_dims[i] = default_dims[i];
812 }
813 pthread_mutex_unlock (&ptx_dev_lock);
814
815 {
816 bool default_dim_p[GOMP_DIM_MAX];
817 for (i = 0; i != GOMP_DIM_MAX; i++)
818 default_dim_p[i] = !dims[i];
819
820 if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize))
821 {
822 for (i = 0; i != GOMP_DIM_MAX; i++)
823 if (default_dim_p[i])
824 dims[i] = nvthd->ptx_dev->default_dims[i];
825
826 if (default_dim_p[GOMP_DIM_VECTOR])
827 dims[GOMP_DIM_VECTOR]
828 = MIN (dims[GOMP_DIM_VECTOR],
829 (targ_fn->max_threads_per_block / warp_size
830 * warp_size));
831
832 if (default_dim_p[GOMP_DIM_WORKER])
833 dims[GOMP_DIM_WORKER]
834 = MIN (dims[GOMP_DIM_WORKER],
835 targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
836 }
837 else
838 {
839 /* Handle the case that the compiler allows the runtime to choose
840 the vector-length conservatively, by ignoring
841 gomp_openacc_dims[GOMP_DIM_VECTOR]. TODO: actually handle
842 it. */
843 int vectors = 0;
844 /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that
845 gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
846 exceed targ_fn->max_threads_per_block. */
847 int workers = gomp_openacc_dims[GOMP_DIM_WORKER];
848 int gangs = gomp_openacc_dims[GOMP_DIM_GANG];
849 int grids, blocks;
850
851 CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
852 &blocks, function, NULL, 0,
853 dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
854 GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
855 "grid = %d, block = %d\n", grids, blocks);
856
857 /* Keep num_gangs proportional to the block size. In the case
858 where the block size is limited by shared memory or the
859 register file capacity, this keeps the runtime from
860 excessively over-assigning gangs to the multiprocessor
861 units, whose state would otherwise be swapped out even
862 more than necessary. The constant factor 2 is there to
863 prevent threads from idling when there is insufficient
864 work for them. */
865 if (gangs == 0)
866 gangs = 2 * grids * (blocks / warp_size);
867
868 if (vectors == 0)
869 vectors = warp_size;
870
871 if (workers == 0)
872 {
873 int actual_vectors = (default_dim_p[GOMP_DIM_VECTOR]
874 ? vectors
875 : dims[GOMP_DIM_VECTOR]);
876 workers = blocks / actual_vectors;
877 workers = MAX (workers, 1);
878 /* If we need a per-worker barrier ... . */
879 if (actual_vectors > 32)
880 /* Don't use more barriers than available. */
881 workers = MIN (workers, 15);
882 }
883
884 for (i = 0; i != GOMP_DIM_MAX; i++)
885 if (default_dim_p[i])
886 switch (i)
887 {
888 case GOMP_DIM_GANG: dims[i] = gangs; break;
889 case GOMP_DIM_WORKER: dims[i] = workers; break;
890 case GOMP_DIM_VECTOR: dims[i] = vectors; break;
891 default: GOMP_PLUGIN_fatal ("invalid dim");
892 }
893 }
894 }
895 }
896
897 /* Check if the accelerator has sufficient hardware resources to
898 launch the offloaded kernel. */
899 if (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]
900 > targ_fn->max_threads_per_block)
901 {
902 const char *msg
903 = ("The Nvidia accelerator has insufficient resources to launch '%s'"
904 " with num_workers = %d and vector_length = %d"
905 "; "
906 "recompile the program with 'num_workers = x and vector_length = y'"
907 " on that offloaded region or '-fopenacc-dim=:x:y' where"
908 " x * y <= %d"
909 ".\n");
910 GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
911 dims[GOMP_DIM_VECTOR], targ_fn->max_threads_per_block);
912 }
913
914 /* Check if the accelerator has sufficient barrier resources to
915 launch the offloaded kernel. */
916 if (dims[GOMP_DIM_WORKER] > 15 && dims[GOMP_DIM_VECTOR] > 32)
917 {
918 const char *msg
919 = ("The Nvidia accelerator has insufficient barrier resources to launch"
920 " '%s' with num_workers = %d and vector_length = %d"
921 "; "
922 "recompile the program with 'num_workers = x' on that offloaded"
923 " region or '-fopenacc-dim=:x:' where x <= 15"
924 "; "
925 "or, recompile the program with 'vector_length = 32' on that"
926 " offloaded region or '-fopenacc-dim=::32'"
927 ".\n");
928 GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
929 dims[GOMP_DIM_VECTOR]);
930 }
931
932 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
933 " gangs=%u, workers=%u, vectors=%u\n",
934 __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
935 dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);
936
937 // OpenACC CUDA
938 //
939 // num_gangs nctaid.x
940 // num_workers ntid.y
941 // vector length ntid.x
942
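/* Concretely, for the cuLaunchKernel call below (illustration only):
   gridDim.x = num_gangs, blockDim.x = vector_length and
   blockDim.y = num_workers, so e.g. gangs=32, workers=4, vectors=32 launches
   32 CTAs of 32x4 = 128 threads each.  */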
943 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
944 acc_prof_info *prof_info = thr->prof_info;
945 acc_event_info enqueue_launch_event_info;
946 acc_api_info *api_info = thr->api_info;
947 bool profiling_p = __builtin_expect (prof_info != NULL, false);
948 if (profiling_p)
949 {
950 prof_info->event_type = acc_ev_enqueue_launch_start;
951
952 enqueue_launch_event_info.launch_event.event_type
953 = prof_info->event_type;
954 enqueue_launch_event_info.launch_event.valid_bytes
955 = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
956 enqueue_launch_event_info.launch_event.parent_construct
957 = acc_construct_parallel;
958 enqueue_launch_event_info.launch_event.implicit = 1;
959 enqueue_launch_event_info.launch_event.tool_info = NULL;
960 enqueue_launch_event_info.launch_event.kernel_name = targ_fn->launch->fn;
961 enqueue_launch_event_info.launch_event.num_gangs
962 = dims[GOMP_DIM_GANG];
963 enqueue_launch_event_info.launch_event.num_workers
964 = dims[GOMP_DIM_WORKER];
965 enqueue_launch_event_info.launch_event.vector_length
966 = dims[GOMP_DIM_VECTOR];
967
968 api_info->device_api = acc_device_api_cuda;
969
970 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
971 api_info);
972 }
973
974 kargs[0] = &dp;
975 CUDA_CALL_ASSERT (cuLaunchKernel, function,
976 dims[GOMP_DIM_GANG], 1, 1,
977 dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
978 0, stream, kargs, 0);
979
980 if (profiling_p)
981 {
982 prof_info->event_type = acc_ev_enqueue_launch_end;
983 enqueue_launch_event_info.launch_event.event_type
984 = prof_info->event_type;
985 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
986 api_info);
987 }
988
989 GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__,
990 targ_fn->launch->fn);
991 }
992
993 void * openacc_get_current_cuda_context (void);
994
995 static void
996 goacc_profiling_acc_ev_alloc (struct goacc_thread *thr, void *dp, size_t s)
997 {
998 acc_prof_info *prof_info = thr->prof_info;
999 acc_event_info data_event_info;
1000 acc_api_info *api_info = thr->api_info;
1001
1002 prof_info->event_type = acc_ev_alloc;
1003
1004 data_event_info.data_event.event_type = prof_info->event_type;
1005 data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1006 data_event_info.data_event.parent_construct = acc_construct_parallel;
1007 data_event_info.data_event.implicit = 1;
1008 data_event_info.data_event.tool_info = NULL;
1009 data_event_info.data_event.var_name = NULL;
1010 data_event_info.data_event.bytes = s;
1011 data_event_info.data_event.host_ptr = NULL;
1012 data_event_info.data_event.device_ptr = dp;
1013
1014 api_info->device_api = acc_device_api_cuda;
1015
1016 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
1017 }
1018
1019 /* Free the cached soft-stacks block if it is above the SOFTSTACK_CACHE_LIMIT
1020 size threshold, or if FORCE is true. */
1021
1022 static void
1023 nvptx_stacks_free (struct ptx_device *ptx_dev, bool force)
1024 {
1025 pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
1026 if (ptx_dev->omp_stacks.ptr
1027 && (force || ptx_dev->omp_stacks.size > SOFTSTACK_CACHE_LIMIT))
1028 {
1029 CUresult r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
1030 if (r != CUDA_SUCCESS)
1031 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
1032 ptx_dev->omp_stacks.ptr = 0;
1033 ptx_dev->omp_stacks.size = 0;
1034 }
1035 pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
1036 }
1037
1038 static void *
1039 nvptx_alloc (size_t s, bool suppress_errors)
1040 {
1041 CUdeviceptr d;
1042
1043 CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &d, s);
1044 if (suppress_errors && r == CUDA_ERROR_OUT_OF_MEMORY)
1045 return NULL;
1046 else if (r != CUDA_SUCCESS)
1047 {
1048 GOMP_PLUGIN_error ("nvptx_alloc error: %s", cuda_error (r));
1049 return NULL;
1050 }
1051
1052 /* NOTE: We only do profiling stuff if the memory allocation succeeds. */
1053 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1054 bool profiling_p
1055 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1056 if (profiling_p)
1057 goacc_profiling_acc_ev_alloc (thr, (void *) d, s);
1058
1059 return (void *) d;
1060 }
1061
1062 static void
1063 goacc_profiling_acc_ev_free (struct goacc_thread *thr, void *p)
1064 {
1065 acc_prof_info *prof_info = thr->prof_info;
1066 acc_event_info data_event_info;
1067 acc_api_info *api_info = thr->api_info;
1068
1069 prof_info->event_type = acc_ev_free;
1070
1071 data_event_info.data_event.event_type = prof_info->event_type;
1072 data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1073 data_event_info.data_event.parent_construct = acc_construct_parallel;
1074 data_event_info.data_event.implicit = 1;
1075 data_event_info.data_event.tool_info = NULL;
1076 data_event_info.data_event.var_name = NULL;
1077 data_event_info.data_event.bytes = -1;
1078 data_event_info.data_event.host_ptr = NULL;
1079 data_event_info.data_event.device_ptr = p;
1080
1081 api_info->device_api = acc_device_api_cuda;
1082
1083 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
1084 }
1085
1086 static bool
1087 nvptx_free (void *p, struct ptx_device *ptx_dev)
1088 {
1089 CUdeviceptr pb;
1090 size_t ps;
1091
1092 CUresult r = CUDA_CALL_NOCHECK (cuMemGetAddressRange, &pb, &ps,
1093 (CUdeviceptr) p);
1094 if (r == CUDA_ERROR_NOT_PERMITTED)
1095 {
1096 /* We assume that this error indicates we are in a CUDA callback context,
1097 where no CUDA calls are allowed (see the cuStreamAddCallback
1098 documentation for details). Arrange to free this piece of device
1099 memory later. */
1100 struct ptx_free_block *n
1101 = GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block));
1102 n->ptr = p;
1103 pthread_mutex_lock (&ptx_dev->free_blocks_lock);
1104 n->next = ptx_dev->free_blocks;
1105 ptx_dev->free_blocks = n;
1106 pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
1107 return true;
1108 }
1109 else if (r != CUDA_SUCCESS)
1110 {
1111 GOMP_PLUGIN_error ("cuMemGetAddressRange error: %s", cuda_error (r));
1112 return false;
1113 }
1114 if ((CUdeviceptr) p != pb)
1115 {
1116 GOMP_PLUGIN_error ("invalid device address");
1117 return false;
1118 }
1119
1120 CUDA_CALL (cuMemFree, (CUdeviceptr) p);
1121 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1122 bool profiling_p
1123 = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
1124 if (profiling_p)
1125 goacc_profiling_acc_ev_free (thr, p);
1126
1127 return true;
1128 }
1129
1130 static void *
1131 nvptx_get_current_cuda_device (void)
1132 {
1133 struct nvptx_thread *nvthd = nvptx_thread ();
1134
1135 if (!nvthd || !nvthd->ptx_dev)
1136 return NULL;
1137
1138 return &nvthd->ptx_dev->dev;
1139 }
1140
1141 static void *
1142 nvptx_get_current_cuda_context (void)
1143 {
1144 struct nvptx_thread *nvthd = nvptx_thread ();
1145
1146 if (!nvthd || !nvthd->ptx_dev)
1147 return NULL;
1148
1149 return nvthd->ptx_dev->ctx;
1150 }
1151
1152 /* Plugin entry points. */
1153
1154 const char *
1155 GOMP_OFFLOAD_get_name (void)
1156 {
1157 return "nvptx";
1158 }
1159
1160 unsigned int
1161 GOMP_OFFLOAD_get_caps (void)
1162 {
1163 return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
1164 }
1165
1166 int
1167 GOMP_OFFLOAD_get_type (void)
1168 {
1169 return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
1170 }
1171
1172 int
1173 GOMP_OFFLOAD_get_num_devices (void)
1174 {
1175 return nvptx_get_num_devices ();
1176 }
1177
1178 bool
1179 GOMP_OFFLOAD_init_device (int n)
1180 {
1181 struct ptx_device *dev;
1182
1183 pthread_mutex_lock (&ptx_dev_lock);
1184
1185 if (!nvptx_init () || ptx_devices[n] != NULL)
1186 {
1187 pthread_mutex_unlock (&ptx_dev_lock);
1188 return false;
1189 }
1190
1191 dev = nvptx_open_device (n);
1192 if (dev)
1193 {
1194 ptx_devices[n] = dev;
1195 instantiated_devices++;
1196 }
1197
1198 pthread_mutex_unlock (&ptx_dev_lock);
1199
1200 return dev != NULL;
1201 }
1202
1203 bool
1204 GOMP_OFFLOAD_fini_device (int n)
1205 {
1206 pthread_mutex_lock (&ptx_dev_lock);
1207
1208 if (ptx_devices[n] != NULL)
1209 {
1210 if (!nvptx_attach_host_thread_to_device (n)
1211 || !nvptx_close_device (ptx_devices[n]))
1212 {
1213 pthread_mutex_unlock (&ptx_dev_lock);
1214 return false;
1215 }
1216 ptx_devices[n] = NULL;
1217 instantiated_devices--;
1218 }
1219
1220 if (instantiated_devices == 0)
1221 {
1222 free (ptx_devices);
1223 ptx_devices = NULL;
1224 }
1225
1226 pthread_mutex_unlock (&ptx_dev_lock);
1227 return true;
1228 }
1229
1230 /* Return the libgomp version number we're compatible with. There is
1231 no requirement for cross-version compatibility. */
1232
1233 unsigned
1234 GOMP_OFFLOAD_version (void)
1235 {
1236 return GOMP_VERSION;
1237 }
1238
1239 /* Initialize __nvptx_clocktick, if present in MODULE. */
1240
1241 static void
1242 nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
1243 {
1244 CUdeviceptr dptr;
1245 CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
1246 module, "__nvptx_clocktick");
1247 if (r == CUDA_ERROR_NOT_FOUND)
1248 return;
1249 if (r != CUDA_SUCCESS)
1250 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1251 double __nvptx_clocktick = 1e-3 / dev->clock_khz;
1252 r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
1253 sizeof (__nvptx_clocktick));
1254 if (r != CUDA_SUCCESS)
1255 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
1256 }
1257
1258 /* Load the (partial) program described by TARGET_DATA to device
1259 number ORD. Allocate and return TARGET_TABLE. */
1260
1261 int
1262 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
1263 struct addr_pair **target_table)
1264 {
1265 CUmodule module;
1266 const char *const *var_names;
1267 const struct targ_fn_launch *fn_descs;
1268 unsigned int fn_entries, var_entries, i, j;
1269 struct targ_fn_descriptor *targ_fns;
1270 struct addr_pair *targ_tbl;
1271 const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
1272 struct ptx_image_data *new_image;
1273 struct ptx_device *dev;
1274
1275 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1276 {
1277 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1278 " (expected %u, received %u)",
1279 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1280 return -1;
1281 }
1282
1283 if (!nvptx_attach_host_thread_to_device (ord)
1284 || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
1285 return -1;
1286
1287 dev = ptx_devices[ord];
1288
1289 /* The mkoffload utility emits a struct of pointers/integers at the
1290 start of each offload image. The array of kernel names and the
1291 function addresses form a one-to-one correspondence. */
1292
1293 var_entries = img_header->var_num;
1294 var_names = img_header->var_names;
1295 fn_entries = img_header->fn_num;
1296 fn_descs = img_header->fn_descs;
1297
1298 targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
1299 * (fn_entries + var_entries));
1300 targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
1301 * fn_entries);
1302
1303 *target_table = targ_tbl;
1304
1305 new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
1306 new_image->target_data = target_data;
1307 new_image->module = module;
1308 new_image->fns = targ_fns;
1309
1310 pthread_mutex_lock (&dev->image_lock);
1311 new_image->next = dev->images;
1312 dev->images = new_image;
1313 pthread_mutex_unlock (&dev->image_lock);
1314
1315 for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
1316 {
1317 CUfunction function;
1318 int nregs, mthrs;
1319
1320 CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
1321 fn_descs[i].fn);
1322 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
1323 CU_FUNC_ATTRIBUTE_NUM_REGS, function);
1324 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
1325 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);
1326
1327 targ_fns->fn = function;
1328 targ_fns->launch = &fn_descs[i];
1329 targ_fns->regs_per_thread = nregs;
1330 targ_fns->max_threads_per_block = mthrs;
1331
1332 targ_tbl->start = (uintptr_t) targ_fns;
1333 targ_tbl->end = targ_tbl->start + 1;
1334 }
1335
1336 for (j = 0; j < var_entries; j++, targ_tbl++)
1337 {
1338 CUdeviceptr var;
1339 size_t bytes;
1340
1341 CUDA_CALL_ERET (-1, cuModuleGetGlobal,
1342 &var, &bytes, module, var_names[j]);
1343
1344 targ_tbl->start = (uintptr_t) var;
1345 targ_tbl->end = targ_tbl->start + bytes;
1346 }
1347
1348 nvptx_set_clocktick (module, dev);
1349
1350 return fn_entries + var_entries;
1351 }
1352
1353 /* Unload the program described by TARGET_DATA. DEV_DATA is the
1354 function descriptors allocated by GOMP_OFFLOAD_load_image. */
1355
1356 bool
1357 GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
1358 {
1359 struct ptx_image_data *image, **prev_p;
1360 struct ptx_device *dev = ptx_devices[ord];
1361
1362 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1363 {
1364 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1365 " (expected %u, received %u)",
1366 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1367 return false;
1368 }
1369
1370 bool ret = true;
1371 pthread_mutex_lock (&dev->image_lock);
1372 for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
1373 if (image->target_data == target_data)
1374 {
1375 *prev_p = image->next;
1376 if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
1377 ret = false;
1378 free (image->fns);
1379 free (image);
1380 break;
1381 }
1382 pthread_mutex_unlock (&dev->image_lock);
1383 return ret;
1384 }
1385
1386 void *
1387 GOMP_OFFLOAD_alloc (int ord, size_t size)
1388 {
1389 if (!nvptx_attach_host_thread_to_device (ord))
1390 return NULL;
1391
1392 struct ptx_device *ptx_dev = ptx_devices[ord];
1393 struct ptx_free_block *blocks, *tmp;
1394
1395 pthread_mutex_lock (&ptx_dev->free_blocks_lock);
1396 blocks = ptx_dev->free_blocks;
1397 ptx_dev->free_blocks = NULL;
1398 pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
1399
1400 nvptx_stacks_free (ptx_dev, false);
1401
1402 while (blocks)
1403 {
1404 tmp = blocks->next;
1405 nvptx_free (blocks->ptr, ptx_dev);
1406 free (blocks);
1407 blocks = tmp;
1408 }
1409
1410 void *d = nvptx_alloc (size, true);
1411 if (d)
1412 return d;
1413 else
1414 {
1415 /* Memory allocation failed. Try freeing the stacks block, and
1416 retrying. */
1417 nvptx_stacks_free (ptx_dev, true);
1418 return nvptx_alloc (size, false);
1419 }
1420 }
1421
1422 bool
1423 GOMP_OFFLOAD_free (int ord, void *ptr)
1424 {
1425 return (nvptx_attach_host_thread_to_device (ord)
1426 && nvptx_free (ptr, ptx_devices[ord]));
1427 }
1428
1429 void
1430 GOMP_OFFLOAD_openacc_exec (void (*fn) (void *), size_t mapnum,
1431 void **hostaddrs, void **devaddrs,
1432 unsigned *dims, void *targ_mem_desc)
1433 {
1434 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
1435
1436 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1437 acc_prof_info *prof_info = thr->prof_info;
1438 acc_event_info data_event_info;
1439 acc_api_info *api_info = thr->api_info;
1440 bool profiling_p = __builtin_expect (prof_info != NULL, false);
1441
1442 void **hp = NULL;
1443 CUdeviceptr dp = 0;
1444
1445 if (mapnum > 0)
1446 {
1447 size_t s = mapnum * sizeof (void *);
1448 hp = alloca (s);
1449 for (int i = 0; i < mapnum; i++)
1450 hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
1451 CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
1452 if (profiling_p)
1453 goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
1454 }
1455
1456 /* Copy the (device) pointers to arguments to the device (dp and hp might in
1457 fact have the same value on a unified-memory system). */
1458 if (mapnum > 0)
1459 {
1460 if (profiling_p)
1461 {
1462 prof_info->event_type = acc_ev_enqueue_upload_start;
1463
1464 data_event_info.data_event.event_type = prof_info->event_type;
1465 data_event_info.data_event.valid_bytes
1466 = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1467 data_event_info.data_event.parent_construct
1468 = acc_construct_parallel;
1469 data_event_info.data_event.implicit = 1; /* Always implicit. */
1470 data_event_info.data_event.tool_info = NULL;
1471 data_event_info.data_event.var_name = NULL;
1472 data_event_info.data_event.bytes = mapnum * sizeof (void *);
1473 data_event_info.data_event.host_ptr = hp;
1474 data_event_info.data_event.device_ptr = (const void *) dp;
1475
1476 api_info->device_api = acc_device_api_cuda;
1477
1478 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1479 api_info);
1480 }
1481 CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, (void *) hp,
1482 mapnum * sizeof (void *));
1483 if (profiling_p)
1484 {
1485 prof_info->event_type = acc_ev_enqueue_upload_end;
1486 data_event_info.data_event.event_type = prof_info->event_type;
1487 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1488 api_info);
1489 }
1490 }
1491
1492 nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
1493 dp, NULL);
1494
1495 CUresult r = CUDA_CALL_NOCHECK (cuStreamSynchronize, NULL);
1496 const char *maybe_abort_msg = "(perhaps abort was called)";
1497 if (r == CUDA_ERROR_LAUNCH_FAILED)
1498 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
1499 maybe_abort_msg);
1500 else if (r != CUDA_SUCCESS)
1501 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
1502
1503 CUDA_CALL_ASSERT (cuMemFree, dp);
1504 if (profiling_p)
1505 goacc_profiling_acc_ev_free (thr, (void *) dp);
1506 }
1507
1508 static void
1509 cuda_free_argmem (void *ptr)
1510 {
1511 void **block = (void **) ptr;
1512 nvptx_free (block[0], (struct ptx_device *) block[1]);
1513 free (block);
1514 }
1515
1516 void
1517 GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *), size_t mapnum,
1518 void **hostaddrs, void **devaddrs,
1519 unsigned *dims, void *targ_mem_desc,
1520 struct goacc_asyncqueue *aq)
1521 {
1522 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
1523
1524 struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
1525 acc_prof_info *prof_info = thr->prof_info;
1526 acc_event_info data_event_info;
1527 acc_api_info *api_info = thr->api_info;
1528 bool profiling_p = __builtin_expect (prof_info != NULL, false);
1529
1530 void **hp = NULL;
1531 CUdeviceptr dp = 0;
1532 void **block = NULL;
1533
1534 if (mapnum > 0)
1535 {
1536 size_t s = mapnum * sizeof (void *);
1537 block = (void **) GOMP_PLUGIN_malloc (2 * sizeof (void *) + s);
1538 hp = block + 2;
1539 for (int i = 0; i < mapnum; i++)
1540 hp[i] = (devaddrs[i] ? devaddrs[i] : hostaddrs[i]);
1541 CUDA_CALL_ASSERT (cuMemAlloc, &dp, s);
1542 if (profiling_p)
1543 goacc_profiling_acc_ev_alloc (thr, (void *) dp, s);
1544 }
1545
1546 /* Copy the (device) pointers to arguments to the device (dp and hp might in
1547 fact have the same value on a unified-memory system). */
1548 if (mapnum > 0)
1549 {
1550 if (profiling_p)
1551 {
1552 prof_info->event_type = acc_ev_enqueue_upload_start;
1553
1554 data_event_info.data_event.event_type = prof_info->event_type;
1555 data_event_info.data_event.valid_bytes
1556 = _ACC_DATA_EVENT_INFO_VALID_BYTES;
1557 data_event_info.data_event.parent_construct
1558 = acc_construct_parallel;
1559 data_event_info.data_event.implicit = 1; /* Always implicit. */
1560 data_event_info.data_event.tool_info = NULL;
1561 data_event_info.data_event.var_name = NULL;
1562 data_event_info.data_event.bytes = mapnum * sizeof (void *);
1563 data_event_info.data_event.host_ptr = hp;
1564 data_event_info.data_event.device_ptr = (const void *) dp;
1565
1566 api_info->device_api = acc_device_api_cuda;
1567
1568 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1569 api_info);
1570 }
1571
1572 CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, dp, (void *) hp,
1573 mapnum * sizeof (void *), aq->cuda_stream);
1574 block[0] = (void *) dp;
1575
1576 struct nvptx_thread *nvthd =
1577 (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
1578 block[1] = (void *) nvthd->ptx_dev;
1579
1580 if (profiling_p)
1581 {
1582 prof_info->event_type = acc_ev_enqueue_upload_end;
1583 data_event_info.data_event.event_type = prof_info->event_type;
1584 GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
1585 api_info);
1586 }
1587 }
1588
1589 nvptx_exec (fn, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc,
1590 dp, aq->cuda_stream);
1591
1592 if (mapnum > 0)
1593 GOMP_OFFLOAD_openacc_async_queue_callback (aq, cuda_free_argmem, block);
1594 }
1595
1596 void *
1597 GOMP_OFFLOAD_openacc_create_thread_data (int ord)
1598 {
1599 struct ptx_device *ptx_dev;
1600 struct nvptx_thread *nvthd
1601 = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
1602 CUcontext thd_ctx;
1603
1604 ptx_dev = ptx_devices[ord];
1605
1606 assert (ptx_dev);
1607
1608 CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);
1609
1610 assert (ptx_dev->ctx);
1611
1612 if (!thd_ctx)
1613 CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
1614
1615 nvthd->ptx_dev = ptx_dev;
1616
1617 return (void *) nvthd;
1618 }
1619
1620 void
1621 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
1622 {
1623 free (data);
1624 }
1625
1626 void *
1627 GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
1628 {
1629 return nvptx_get_current_cuda_device ();
1630 }
1631
1632 void *
1633 GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
1634 {
1635 return nvptx_get_current_cuda_context ();
1636 }
1637
1638 /* This returns a CUstream. */
1639 void *
1640 GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *aq)
1641 {
1642 return (void *) aq->cuda_stream;
1643 }
1644
1645 /* This takes a CUstream. */
1646 int
1647 GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream)
1648 {
1649 if (aq->cuda_stream)
1650 {
1651 CUDA_CALL_ASSERT (cuStreamSynchronize, aq->cuda_stream);
1652 CUDA_CALL_ASSERT (cuStreamDestroy, aq->cuda_stream);
1653 }
1654
1655 aq->cuda_stream = (CUstream) stream;
1656 return 1;
1657 }
1658
1659 struct goacc_asyncqueue *
1660 GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
1661 {
1662 CUstream stream = NULL;
1663 CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, CU_STREAM_DEFAULT);
1664
1665 struct goacc_asyncqueue *aq
1666 = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue));
1667 aq->cuda_stream = stream;
1668 return aq;
1669 }
1670
1671 bool
1672 GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
1673 {
1674 CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream);
1675 free (aq);
1676 return true;
1677 }
1678
1679 int
1680 GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
1681 {
1682 CUresult r = CUDA_CALL_NOCHECK (cuStreamQuery, aq->cuda_stream);
1683 if (r == CUDA_SUCCESS)
1684 return 1;
1685 if (r == CUDA_ERROR_NOT_READY)
1686 return 0;
1687
1688 GOMP_PLUGIN_error ("cuStreamQuery error: %s", cuda_error (r));
1689 return -1;
1690 }
1691
1692 bool
1693 GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
1694 {
1695 CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream);
1696 return true;
1697 }
1698
1699 bool
1700 GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
1701 struct goacc_asyncqueue *aq2)
1702 {
1703 CUevent e;
1704 CUDA_CALL_ERET (false, cuEventCreate, &e, CU_EVENT_DISABLE_TIMING);
1705 CUDA_CALL_ERET (false, cuEventRecord, e, aq1->cuda_stream);
1706 CUDA_CALL_ERET (false, cuStreamWaitEvent, aq2->cuda_stream, e, 0);
1707 return true;
1708 }
1709
1710 static void
1711 cuda_callback_wrapper (CUstream stream, CUresult res, void *ptr)
1712 {
1713 if (res != CUDA_SUCCESS)
1714 GOMP_PLUGIN_fatal ("%s error: %s", __FUNCTION__, cuda_error (res));
1715 struct nvptx_callback *cb = (struct nvptx_callback *) ptr;
1716 cb->fn (cb->ptr);
1717 free (ptr);
1718 }
1719
1720 void
1721 GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
1722 void (*callback_fn)(void *),
1723 void *userptr)
1724 {
1725 struct nvptx_callback *b = GOMP_PLUGIN_malloc (sizeof (*b));
1726 b->fn = callback_fn;
1727 b->ptr = userptr;
1728 b->aq = aq;
1729 CUDA_CALL_ASSERT (cuStreamAddCallback, aq->cuda_stream,
1730 cuda_callback_wrapper, (void *) b, 0);
1731 }
1732
1733 static bool
1734 cuda_memcpy_sanity_check (const void *h, const void *d, size_t s)
1735 {
1736 CUdeviceptr pb;
1737 size_t ps;
1738 if (!s)
1739 return true;
1740 if (!d)
1741 {
1742 GOMP_PLUGIN_error ("invalid device address");
1743 return false;
1744 }
1745 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
1746 if (!pb)
1747 {
1748 GOMP_PLUGIN_error ("invalid device address");
1749 return false;
1750 }
1751 if (!h)
1752 {
1753 GOMP_PLUGIN_error ("invalid host address");
1754 return false;
1755 }
1756 if (d == h)
1757 {
1758 GOMP_PLUGIN_error ("invalid host or device address");
1759 return false;
1760 }
1761 if ((void *)(d + s) > (void *)(pb + ps))
1762 {
1763 GOMP_PLUGIN_error ("invalid size");
1764 return false;
1765 }
1766 return true;
1767 }
1768
1769 bool
1770 GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
1771 {
1772 if (!nvptx_attach_host_thread_to_device (ord)
1773 || !cuda_memcpy_sanity_check (src, dst, n))
1774 return false;
1775 CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) dst, src, n);
1776 return true;
1777 }
1778
1779 bool
1780 GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
1781 {
1782 if (!nvptx_attach_host_thread_to_device (ord)
1783 || !cuda_memcpy_sanity_check (dst, src, n))
1784 return false;
1785 CUDA_CALL (cuMemcpyDtoH, dst, (CUdeviceptr) src, n);
1786 return true;
1787 }
1788
1789 bool
1790 GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
1791 {
1792 CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, NULL);
1793 return true;
1794 }
1795
1796 bool
1797 GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src,
1798 size_t n, struct goacc_asyncqueue *aq)
1799 {
1800 if (!nvptx_attach_host_thread_to_device (ord)
1801 || !cuda_memcpy_sanity_check (src, dst, n))
1802 return false;
1803 CUDA_CALL (cuMemcpyHtoDAsync, (CUdeviceptr) dst, src, n, aq->cuda_stream);
1804 return true;
1805 }
1806
1807 bool
1808 GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src,
1809 size_t n, struct goacc_asyncqueue *aq)
1810 {
1811 if (!nvptx_attach_host_thread_to_device (ord)
1812 || !cuda_memcpy_sanity_check (dst, src, n))
1813 return false;
1814 CUDA_CALL (cuMemcpyDtoHAsync, dst, (CUdeviceptr) src, n, aq->cuda_stream);
1815 return true;
1816 }
1817
1818 union goacc_property_value
1819 GOMP_OFFLOAD_openacc_get_property (int n, enum goacc_property prop)
1820 {
1821 union goacc_property_value propval = { .val = 0 };
1822
1823 pthread_mutex_lock (&ptx_dev_lock);
1824
1825 if (n >= nvptx_get_num_devices () || n < 0 || ptx_devices[n] == NULL)
1826 {
1827 pthread_mutex_unlock (&ptx_dev_lock);
1828 return propval;
1829 }
1830
1831 struct ptx_device *ptx_dev = ptx_devices[n];
1832 switch (prop)
1833 {
1834 case GOACC_PROPERTY_MEMORY:
1835 {
1836 size_t total_mem;
1837
1838 CUDA_CALL_ERET (propval, cuDeviceTotalMem, &total_mem, ptx_dev->dev);
1839 propval.val = total_mem;
1840 }
1841 break;
1842 case GOACC_PROPERTY_FREE_MEMORY:
1843 {
1844 size_t total_mem;
1845 size_t free_mem;
1846 CUdevice ctxdev;
1847
1848 CUDA_CALL_ERET (propval, cuCtxGetDevice, &ctxdev);
1849 if (ptx_dev->dev == ctxdev)
1850 CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
1851 else if (ptx_dev->ctx)
1852 {
1853 CUcontext old_ctx;
1854
1855 CUDA_CALL_ERET (propval, cuCtxPushCurrent, ptx_dev->ctx);
1856 CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
1857 CUDA_CALL_ASSERT (cuCtxPopCurrent, &old_ctx);
1858 }
1859 else
1860 {
1861 CUcontext new_ctx;
1862
1863 CUDA_CALL_ERET (propval, cuCtxCreate, &new_ctx, CU_CTX_SCHED_AUTO,
1864 ptx_dev->dev);
1865 CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
1866 CUDA_CALL_ASSERT (cuCtxDestroy, new_ctx);
1867 }
1868 propval.val = free_mem;
1869 }
1870 break;
1871 case GOACC_PROPERTY_NAME:
1872 propval.ptr = ptx_dev->name;
1873 break;
1874 case GOACC_PROPERTY_VENDOR:
1875 propval.ptr = "Nvidia";
1876 break;
1877 case GOACC_PROPERTY_DRIVER:
1878 propval.ptr = cuda_driver_version_s;
1879 break;
1880 default:
1881 break;
1882 }
1883
1884 pthread_mutex_unlock (&ptx_dev_lock);
1885 return propval;
1886 }
1887
1888 /* Adjust launch dimensions: pick good values for number of blocks and warps
1889 and ensure that number of warps does not exceed CUDA limits as well as GCC's
1890 own limits. */
1891
1892 static void
1893 nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
1894 struct ptx_device *ptx_dev,
1895 int *teams_p, int *threads_p)
1896 {
1897 int max_warps_block = fn->max_threads_per_block / 32;
1898 /* Maximum 32 warps per block is an implementation limit in NVPTX backend
1899 and libgcc, which matches the documented limit of all GPUs as of 2015. */
1900 if (max_warps_block > 32)
1901 max_warps_block = 32;
1902 if (*threads_p <= 0)
1903 *threads_p = 8;
1904 if (*threads_p > max_warps_block)
1905 *threads_p = max_warps_block;
1906
1907 int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
1908 /* This is an estimate of how many blocks the device can host simultaneously.
1909 The actual limit, which may be lower, can be queried with the "occupancy
1910 control" driver interface (since CUDA 6.0). */
1911 int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
1912 if (*teams_p <= 0 || *teams_p > max_blocks)
1913 *teams_p = max_blocks;
1914 }
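/* Worked example with made-up figures: for a kernel with regs_per_thread = 32
   and *threads_p clamped to 8 warps, regs_per_block = 32 * 32 * 8 = 8192;
   on a device with regs_per_sm = 65536 and num_sms = 20 this yields
   max_blocks = (65536 / 8192) * 20 = 160 teams.  */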
1915
1916 /* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
1917 target regions. */
1918
1919 static size_t
1920 nvptx_stacks_size ()
1921 {
1922 return 128 * 1024;
1923 }
1924
1925 /* Return contiguous storage for NUM stacks, each SIZE bytes. The lock for
1926 the storage should be held on entry, and remains held on exit. */
1927
1928 static void *
1929 nvptx_stacks_acquire (struct ptx_device *ptx_dev, size_t size, int num)
1930 {
1931 if (ptx_dev->omp_stacks.ptr && ptx_dev->omp_stacks.size >= size * num)
1932 return (void *) ptx_dev->omp_stacks.ptr;
1933
1934 /* Free the old, too-small stacks. */
1935 if (ptx_dev->omp_stacks.ptr)
1936 {
1937 CUresult r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
1938 if (r != CUDA_SUCCESS)
1939 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s\n", cuda_error (r));
1940 r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
1941 if (r != CUDA_SUCCESS)
1942 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
1943 }
1944
1945 /* Make new and bigger stacks, and remember where we put them and how big
1946 they are. */
1947 CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &ptx_dev->omp_stacks.ptr,
1948 size * num);
1949 if (r != CUDA_SUCCESS)
1950 GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
1951
1952 ptx_dev->omp_stacks.size = size * num;
1953
1954 return (void *) ptx_dev->omp_stacks.ptr;
1955 }
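/* For instance (illustrative figures): a launch with teams = 160 and
   threads = 8 requests 160 * 8 = 1280 per-warp stacks of 128 KiB each,
   i.e. 160 MiB.  As that exceeds SOFTSTACK_CACHE_LIMIT, the block would be
   released again by the next nvptx_stacks_free call from GOMP_OFFLOAD_alloc
   rather than kept cached for a later launch.  */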
1956
1957 void
1958 GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
1959 {
1960 struct targ_fn_descriptor *tgt_fn_desc
1961 = (struct targ_fn_descriptor *) tgt_fn;
1962 CUfunction function = tgt_fn_desc->fn;
1963 const struct targ_fn_launch *launch = tgt_fn_desc->launch;
1964 const char *fn_name = launch->fn;
1965 CUresult r;
1966 struct ptx_device *ptx_dev = ptx_devices[ord];
1967 const char *maybe_abort_msg = "(perhaps abort was called)";
1968 int teams = 0, threads = 0;
1969
1970 if (!args)
1971 GOMP_PLUGIN_fatal ("No target arguments provided");
1972 while (*args)
1973 {
1974 intptr_t id = (intptr_t) *args++, val;
1975 if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
1976 val = (intptr_t) *args++;
1977 else
1978 val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
1979 if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
1980 continue;
1981 val = val > INT_MAX ? INT_MAX : val;
1982 id &= GOMP_TARGET_ARG_ID_MASK;
1983 if (id == GOMP_TARGET_ARG_NUM_TEAMS)
1984 teams = val;
1985 else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
1986 threads = val;
1987 }
1988 nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
1989
1990 size_t stack_size = nvptx_stacks_size ();
1991
1992 pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
1993 void *stacks = nvptx_stacks_acquire (ptx_dev, stack_size, teams * threads);
1994 void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
1995 size_t fn_args_size = sizeof fn_args;
1996 void *config[] = {
1997 CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
1998 CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
1999 CU_LAUNCH_PARAM_END
2000 };
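/* The kernel arguments are passed through cuLaunchKernel's 'extra' mechanism
   (CU_LAUNCH_PARAM_BUFFER_POINTER/_SIZE) rather than via kernelParams, which
   is why the kernelParams argument in the call below is NULL.  */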
2001 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
2002 " [(teams: %u), 1, 1] [(lanes: 32), (threads: %u), 1]\n",
2003 __FUNCTION__, fn_name, teams, threads);
2004 r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
2005 32, threads, 1, 0, NULL, NULL, config);
2006 if (r != CUDA_SUCCESS)
2007 GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
2008
2009 r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
2010 if (r == CUDA_ERROR_LAUNCH_FAILED)
2011 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
2012 maybe_abort_msg);
2013 else if (r != CUDA_SUCCESS)
2014 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
2015
2016 pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
2017 }
2018
2019 /* TODO: Implement GOMP_OFFLOAD_async_run. */