1 /* Plugin for NVPTX execution.
2
3 Copyright (C) 2013-2017 Free Software Foundation, Inc.
4
5 Contributed by Mentor Embedded.
6
7 This file is part of the GNU Offloading and Multi Processing Library
8 (libgomp).
9
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
13 any later version.
14
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
18 more details.
19
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
23
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
28
29 /* Nvidia PTX-specific parts of OpenACC support. The CUDA driver
30 library appears to hold some implicit state, but the documentation
31 is not clear as to what that state might be, or how one might
32 propagate it from one thread to another. */
33
34 #include "openacc.h"
35 #include "config.h"
36 #include "libgomp-plugin.h"
37 #include "oacc-plugin.h"
38 #include "gomp-constants.h"
39
40 #include <pthread.h>
41 #include <cuda.h>
42 #include <stdbool.h>
43 #include <stdint.h>
44 #include <limits.h>
45 #include <string.h>
46 #include <stdio.h>
47 #include <unistd.h>
48 #include <assert.h>
49 #include <errno.h>
50
51 #if PLUGIN_NVPTX_DYNAMIC
52 # include <dlfcn.h>
53
54 # define CUDA_CALLS \
55 CUDA_ONE_CALL (cuCtxCreate) \
56 CUDA_ONE_CALL (cuCtxDestroy) \
57 CUDA_ONE_CALL (cuCtxGetCurrent) \
58 CUDA_ONE_CALL (cuCtxGetDevice) \
59 CUDA_ONE_CALL (cuCtxPopCurrent) \
60 CUDA_ONE_CALL (cuCtxPushCurrent) \
61 CUDA_ONE_CALL (cuCtxSynchronize) \
62 CUDA_ONE_CALL (cuDeviceGet) \
63 CUDA_ONE_CALL (cuDeviceGetAttribute) \
64 CUDA_ONE_CALL (cuDeviceGetCount) \
65 CUDA_ONE_CALL (cuEventCreate) \
66 CUDA_ONE_CALL (cuEventDestroy) \
67 CUDA_ONE_CALL (cuEventElapsedTime) \
68 CUDA_ONE_CALL (cuEventQuery) \
69 CUDA_ONE_CALL (cuEventRecord) \
70 CUDA_ONE_CALL (cuEventSynchronize) \
71 CUDA_ONE_CALL (cuFuncGetAttribute) \
72 CUDA_ONE_CALL (cuGetErrorString) \
73 CUDA_ONE_CALL (cuInit) \
74 CUDA_ONE_CALL (cuLaunchKernel) \
75 CUDA_ONE_CALL (cuLinkAddData) \
76 CUDA_ONE_CALL (cuLinkComplete) \
77 CUDA_ONE_CALL (cuLinkCreate) \
78 CUDA_ONE_CALL (cuLinkDestroy) \
79 CUDA_ONE_CALL (cuMemAlloc) \
80 CUDA_ONE_CALL (cuMemAllocHost) \
81 CUDA_ONE_CALL (cuMemcpy) \
82 CUDA_ONE_CALL (cuMemcpyDtoDAsync) \
83 CUDA_ONE_CALL (cuMemcpyDtoH) \
84 CUDA_ONE_CALL (cuMemcpyDtoHAsync) \
85 CUDA_ONE_CALL (cuMemcpyHtoD) \
86 CUDA_ONE_CALL (cuMemcpyHtoDAsync) \
87 CUDA_ONE_CALL (cuMemFree) \
88 CUDA_ONE_CALL (cuMemFreeHost) \
89 CUDA_ONE_CALL (cuMemGetAddressRange) \
90 CUDA_ONE_CALL (cuMemHostGetDevicePointer)\
91 CUDA_ONE_CALL (cuModuleGetFunction) \
92 CUDA_ONE_CALL (cuModuleGetGlobal) \
93 CUDA_ONE_CALL (cuModuleLoad) \
94 CUDA_ONE_CALL (cuModuleLoadData) \
95 CUDA_ONE_CALL (cuModuleUnload) \
96 CUDA_ONE_CALL (cuStreamCreate) \
97 CUDA_ONE_CALL (cuStreamDestroy) \
98 CUDA_ONE_CALL (cuStreamQuery) \
99 CUDA_ONE_CALL (cuStreamSynchronize) \
100 CUDA_ONE_CALL (cuStreamWaitEvent)
101 # define CUDA_ONE_CALL(call) \
102 __typeof (call) *call;
103 struct cuda_lib_s {
104 CUDA_CALLS
105 } cuda_lib;
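/* CUDA_CALLS is an X-macro list: it is expanded above, with CUDA_ONE_CALL
declaring one function-pointer field per driver entry point, and expanded
again inside init_cuda_lib below, where CUDA_ONE_CALL is redefined to dlsym
each symbol into the corresponding cuda_lib field. */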
106
107 /* -1 if init_cuda_lib has not been called yet, false
108 if it has been and failed, true if it has been and succeeded. */
109 static char cuda_lib_inited = -1;
110
111 /* Dynamically load the CUDA driver library and initialize its function
112 pointers; return true if successful, false otherwise. */
113 static bool
114 init_cuda_lib (void)
115 {
116 if (cuda_lib_inited != -1)
117 return cuda_lib_inited;
118 const char *cuda_runtime_lib = "libcuda.so.1";
119 void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
120 cuda_lib_inited = false;
121 if (h == NULL)
122 return false;
123 # undef CUDA_ONE_CALL
124 # define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call)
125 # define CUDA_ONE_CALL_1(call) \
126 cuda_lib.call = dlsym (h, #call); \
127 if (cuda_lib.call == NULL) \
128 return false;
129 CUDA_CALLS
130 cuda_lib_inited = true;
131 return true;
132 }
133 # undef CUDA_ONE_CALL
134 # undef CUDA_ONE_CALL_1
135 # define CUDA_CALL_PREFIX cuda_lib.
136 #else
137 # define CUDA_CALL_PREFIX
138 # define init_cuda_lib() true
139 #endif
140
141 /* Convenience macros for the frequently used sequence of a CUDA library
142 call followed by error handling, as well as for CUDA library calls whose
143 callers do the error checking themselves or skip it entirely. */
144
145 #define CUDA_CALL_ERET(ERET, FN, ...) \
146 do { \
147 unsigned __r \
148 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
149 if (__r != CUDA_SUCCESS) \
150 { \
151 GOMP_PLUGIN_error (#FN " error: %s", \
152 cuda_error (__r)); \
153 return ERET; \
154 } \
155 } while (0)
156
157 #define CUDA_CALL(FN, ...) \
158 CUDA_CALL_ERET (false, FN, __VA_ARGS__)
159
160 #define CUDA_CALL_ASSERT(FN, ...) \
161 do { \
162 unsigned __r \
163 = CUDA_CALL_PREFIX FN (__VA_ARGS__); \
164 if (__r != CUDA_SUCCESS) \
165 { \
166 GOMP_PLUGIN_fatal (#FN " error: %s", \
167 cuda_error (__r)); \
168 } \
169 } while (0)
170
171 #define CUDA_CALL_NOCHECK(FN, ...) \
172 CUDA_CALL_PREFIX FN (__VA_ARGS__)
173
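/* Example usage (illustrative only, not from the original source): inside a
bool-returning function a driver call is typically written as

     CUDA_CALL (cuMemAlloc, &d, s);

which performs the call through CUDA_CALL_PREFIX (the plain symbol, or the
dlopen'ed cuda_lib.cuMemAlloc pointer when PLUGIN_NVPTX_DYNAMIC is set),
reports any failure via GOMP_PLUGIN_error and makes the enclosing function
return false. */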
174 static const char *
175 cuda_error (CUresult r)
176 {
177 #if CUDA_VERSION < 7000
178 /* Specified in documentation and present in library from at least
179 5.5. Not declared in header file prior to 7.0. */
180 extern CUresult cuGetErrorString (CUresult, const char **);
181 #endif
182 const char *desc;
183
184 r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
185 if (r != CUDA_SUCCESS)
186 desc = "unknown cuda error";
187
188 return desc;
189 }
190
191 static unsigned int instantiated_devices = 0;
192 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
193
194 struct ptx_stream
195 {
196 CUstream stream;
197 pthread_t host_thread;
198 bool multithreaded;
199
200 CUdeviceptr d;
201 void *h;
202 void *h_begin;
203 void *h_end;
204 void *h_next;
205 void *h_prev;
206 void *h_tail;
207
208 struct ptx_stream *next;
209 };
210
211 /* Thread-specific data for PTX. */
212
213 struct nvptx_thread
214 {
215 struct ptx_stream *current_stream;
216 struct ptx_device *ptx_dev;
217 };
218
219 struct map
220 {
221 int async;
222 size_t size;
223 char mappings[0];
224 };
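/* Each stream owns one page of page-locked host memory (map_init), mapped
into the device address space as well. map_push carves a chunk out of it in
ring-buffer fashion: a struct map header followed by SIZE bytes used for the
kernel argument pointers, returning both the host and the device address of
the payload. map_pop retires the oldest chunk once the corresponding kernel
has completed (either synchronously in nvptx_exec or from event_gc). */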
225
226 static bool
227 map_init (struct ptx_stream *s)
228 {
229 int size = getpagesize ();
230
231 assert (s);
232 assert (!s->d);
233 assert (!s->h);
234
235 CUDA_CALL (cuMemAllocHost, &s->h, size);
236 CUDA_CALL (cuMemHostGetDevicePointer, &s->d, s->h, 0);
237
238 assert (s->h);
239
240 s->h_begin = s->h;
241 s->h_end = s->h_begin + size;
242 s->h_next = s->h_prev = s->h_tail = s->h_begin;
243
244 assert (s->h_next);
245 assert (s->h_end);
246 return true;
247 }
248
249 static bool
250 map_fini (struct ptx_stream *s)
251 {
252 CUDA_CALL (cuMemFreeHost, s->h);
253 return true;
254 }
255
256 static void
257 map_pop (struct ptx_stream *s)
258 {
259 struct map *m;
260
261 assert (s != NULL);
262 assert (s->h_next);
263 assert (s->h_prev);
264 assert (s->h_tail);
265
266 m = s->h_tail;
267
268 s->h_tail += m->size;
269
270 if (s->h_tail >= s->h_end)
271 s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);
272
273 if (s->h_next == s->h_tail)
274 s->h_prev = s->h_next;
275
276 assert (s->h_next >= s->h_begin);
277 assert (s->h_tail >= s->h_begin);
278 assert (s->h_prev >= s->h_begin);
279
280 assert (s->h_next <= s->h_end);
281 assert (s->h_tail <= s->h_end);
282 assert (s->h_prev <= s->h_end);
283 }
284
285 static void
286 map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d)
287 {
288 int left;
289 int offset;
290 struct map *m;
291
292 assert (s != NULL);
293
294 left = s->h_end - s->h_next;
295 size += sizeof (struct map);
296
297 assert (s->h_prev);
298 assert (s->h_next);
299
300 if (size >= left)
301 {
302 m = s->h_prev;
303 m->size += left;
304 s->h_next = s->h_begin;
305
306 if (s->h_next + size > s->h_end)
307 GOMP_PLUGIN_fatal ("unable to push map");
308 }
309
310 assert (s->h_next);
311
312 m = s->h_next;
313 m->async = async;
314 m->size = size;
315
316 offset = (void *)&m->mappings[0] - s->h;
317
318 *d = (void *)(s->d + offset);
319 *h = (void *)(s->h + offset);
320
321 s->h_prev = s->h_next;
322 s->h_next += size;
323
324 assert (s->h_prev);
325 assert (s->h_next);
326
327 assert (s->h_next >= s->h_begin);
328 assert (s->h_tail >= s->h_begin);
329 assert (s->h_prev >= s->h_begin);
330 assert (s->h_next <= s->h_end);
331 assert (s->h_tail <= s->h_end);
332 assert (s->h_prev <= s->h_end);
333
334 return;
335 }
336
337 /* Target data function launch information. */
338
339 struct targ_fn_launch
340 {
341 const char *fn;
342 unsigned short dim[GOMP_DIM_MAX];
343 };
344
345 /* Target PTX object information. */
346
347 struct targ_ptx_obj
348 {
349 const char *code;
350 size_t size;
351 };
352
353 /* Target data image information. */
354
355 typedef struct nvptx_tdata
356 {
357 const struct targ_ptx_obj *ptx_objs;
358 unsigned ptx_num;
359
360 const char *const *var_names;
361 unsigned var_num;
362
363 const struct targ_fn_launch *fn_descs;
364 unsigned fn_num;
365 } nvptx_tdata_t;
366
367 /* Descriptor of a loaded function. */
368
369 struct targ_fn_descriptor
370 {
371 CUfunction fn;
372 const struct targ_fn_launch *launch;
373 int regs_per_thread;
374 int max_threads_per_block;
375 };
376
377 /* A loaded PTX image. */
378 struct ptx_image_data
379 {
380 const void *target_data;
381 CUmodule module;
382
383 struct targ_fn_descriptor *fns; /* Array of functions. */
384
385 struct ptx_image_data *next;
386 };
387
388 struct ptx_device
389 {
390 CUcontext ctx;
391 bool ctx_shared;
392 CUdevice dev;
393 struct ptx_stream *null_stream;
394 /* All non-null streams associated with this device (actually context),
395 either created implicitly or passed in from the user (via
396 acc_set_cuda_stream). */
397 struct ptx_stream *active_streams;
398 struct {
399 struct ptx_stream **arr;
400 int size;
401 } async_streams;
402 /* A lock for use when manipulating the above stream list and array. */
403 pthread_mutex_t stream_lock;
404 int ord;
405 bool overlap;
406 bool map;
407 bool concur;
408 bool mkern;
409 int mode;
410 int clock_khz;
411 int num_sms;
412 int regs_per_block;
413 int regs_per_sm;
414
415 struct ptx_image_data *images; /* Images loaded on device. */
416 pthread_mutex_t image_lock; /* Lock for above list. */
417
418 struct ptx_device *next;
419 };
420
421 enum ptx_event_type
422 {
423 PTX_EVT_MEM,
424 PTX_EVT_KNL,
425 PTX_EVT_SYNC,
426 PTX_EVT_ASYNC_CLEANUP
427 };
428
429 struct ptx_event
430 {
431 CUevent *evt;
432 int type;
433 void *addr;
434 int ord;
435 int val;
436
437 struct ptx_event *next;
438 };
439
440 static pthread_mutex_t ptx_event_lock;
441 static struct ptx_event *ptx_events;
442
443 static struct ptx_device **ptx_devices;
444
445 static inline struct nvptx_thread *
446 nvptx_thread (void)
447 {
448 return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
449 }
450
451 static bool
452 init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
453 {
454 int i;
455 struct ptx_stream *null_stream
456 = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
457
458 null_stream->stream = NULL;
459 null_stream->host_thread = pthread_self ();
460 null_stream->multithreaded = true;
461 null_stream->d = (CUdeviceptr) NULL;
462 null_stream->h = NULL;
463 if (!map_init (null_stream))
464 return false;
465
466 ptx_dev->null_stream = null_stream;
467 ptx_dev->active_streams = NULL;
468 pthread_mutex_init (&ptx_dev->stream_lock, NULL);
469
470 if (concurrency < 1)
471 concurrency = 1;
472
473 /* This is just a guess -- make space for as many async streams as the
474 current device is capable of concurrently executing. This can grow
475 later as necessary. No streams are created yet. */
476 ptx_dev->async_streams.arr
477 = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
478 ptx_dev->async_streams.size = concurrency;
479
480 for (i = 0; i < concurrency; i++)
481 ptx_dev->async_streams.arr[i] = NULL;
482
483 return true;
484 }
485
486 static bool
487 fini_streams_for_device (struct ptx_device *ptx_dev)
488 {
489 free (ptx_dev->async_streams.arr);
490
491 bool ret = true;
492 while (ptx_dev->active_streams != NULL)
493 {
494 struct ptx_stream *s = ptx_dev->active_streams;
495 ptx_dev->active_streams = ptx_dev->active_streams->next;
496
497 ret &= map_fini (s);
498
499 CUresult r = CUDA_CALL_NOCHECK (cuStreamDestroy, s->stream);
500 if (r != CUDA_SUCCESS)
501 {
502 GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r));
503 ret = false;
504 }
505 free (s);
506 }
507
508 ret &= map_fini (ptx_dev->null_stream);
509 free (ptx_dev->null_stream);
510 return ret;
511 }
512
513 /* Select a stream for the (OpenACC-semantics) ASYNC argument for the
514 current thread THREAD (and also the current device/context). If CREATE is
515 true, create the stream if it does not exist (or use EXISTING if it is
516 non-NULL), and associate the stream with THREAD. Return the stream to
517 use. */
518
519 static struct ptx_stream *
520 select_stream_for_async (int async, pthread_t thread, bool create,
521 CUstream existing)
522 {
523 struct nvptx_thread *nvthd = nvptx_thread ();
524 /* Local copy of TLS variable. */
525 struct ptx_device *ptx_dev = nvthd->ptx_dev;
526 struct ptx_stream *stream = NULL;
527 int orig_async = async;
528
529 /* The special value acc_async_noval (-1) maps (for now) to an
530 implicitly-created stream, which is then handled the same as any other
531 numbered async stream. Other options are available, e.g. using the null
532 stream for anonymous async operations, or choosing an idle stream from an
533 active set. But, stick with this for now. */
534 if (async > acc_async_sync)
535 async++;
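/* With acc_async_sync == -2 and acc_async_noval == -1 (see openacc.h), the
increment above maps acc_async_noval to slot 0 and user async values
0, 1, ... to slots 1, 2, ... of the async_streams array. */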
536
537 if (create)
538 pthread_mutex_lock (&ptx_dev->stream_lock);
539
540 /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
541 null stream, and in fact better performance may be obtainable if it doesn't
542 (because the null stream enforces overly-strict synchronisation with
543 respect to other streams for legacy reasons, and that's probably not
544 needed with OpenACC). Maybe investigate later. */
545 if (async == acc_async_sync)
546 stream = ptx_dev->null_stream;
547 else if (async >= 0 && async < ptx_dev->async_streams.size
548 && ptx_dev->async_streams.arr[async] && !(create && existing))
549 stream = ptx_dev->async_streams.arr[async];
550 else if (async >= 0 && create)
551 {
552 if (async >= ptx_dev->async_streams.size)
553 {
554 int i, newsize = ptx_dev->async_streams.size * 2;
555
556 if (async >= newsize)
557 newsize = async + 1;
558
559 ptx_dev->async_streams.arr
560 = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
561 newsize * sizeof (struct ptx_stream *));
562
563 for (i = ptx_dev->async_streams.size; i < newsize; i++)
564 ptx_dev->async_streams.arr[i] = NULL;
565
566 ptx_dev->async_streams.size = newsize;
567 }
568
569 /* Create a new stream on-demand if there isn't one already, or if we're
570 setting a particular async value to an existing (externally-provided)
571 stream. */
572 if (!ptx_dev->async_streams.arr[async] || existing)
573 {
574 CUresult r;
575 struct ptx_stream *s
576 = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
577
578 if (existing)
579 s->stream = existing;
580 else
581 {
582 r = CUDA_CALL_NOCHECK (cuStreamCreate, &s->stream,
583 CU_STREAM_DEFAULT);
584 if (r != CUDA_SUCCESS)
585 {
586 pthread_mutex_unlock (&ptx_dev->stream_lock);
587 GOMP_PLUGIN_fatal ("cuStreamCreate error: %s",
588 cuda_error (r));
589 }
590 }
591
592 /* If CREATE is true, we're going to be queueing some work on this
593 stream. Associate it with the current host thread. */
594 s->host_thread = thread;
595 s->multithreaded = false;
596
597 s->d = (CUdeviceptr) NULL;
598 s->h = NULL;
599 if (!map_init (s))
600 {
601 pthread_mutex_unlock (&ptx_dev->stream_lock);
602 GOMP_PLUGIN_fatal ("map_init fail");
603 }
604
605 s->next = ptx_dev->active_streams;
606 ptx_dev->active_streams = s;
607 ptx_dev->async_streams.arr[async] = s;
608 }
609
610 stream = ptx_dev->async_streams.arr[async];
611 }
612 else if (async < 0)
613 {
614 if (create)
615 pthread_mutex_unlock (&ptx_dev->stream_lock);
616 GOMP_PLUGIN_fatal ("bad async %d", async);
617 }
618
619 if (create)
620 {
621 assert (stream != NULL);
622
623 /* If we're trying to use the same stream from different threads
624 simultaneously, set stream->multithreaded to true. This affects the
625 behaviour of acc_async_test_all and acc_wait_all, which are supposed to
626 only wait for asynchronous launches from the same host thread they are
627 invoked on. If multiple threads use the same async value, we make note
628 of that here and fall back to testing/waiting for all threads in those
629 functions. */
630 if (thread != stream->host_thread)
631 stream->multithreaded = true;
632
633 pthread_mutex_unlock (&ptx_dev->stream_lock);
634 }
635 else if (stream && !stream->multithreaded
636 && !pthread_equal (stream->host_thread, thread))
637 GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);
638
639 return stream;
640 }
641
642 /* Initialize the CUDA library and the device table. Return TRUE on success,
643 else FALSE. PTX_DEV_LOCK should be locked on entry and remains locked on exit. */
644
645 static bool
646 nvptx_init (void)
647 {
648 int ndevs;
649
650 if (instantiated_devices != 0)
651 return true;
652
653 ptx_events = NULL;
654 pthread_mutex_init (&ptx_event_lock, NULL);
655
656 if (!init_cuda_lib ())
657 return false;
658
659 CUDA_CALL (cuInit, 0);
660
661 CUDA_CALL (cuDeviceGetCount, &ndevs);
662 ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
663 * ndevs);
664 return true;
665 }
666
667 /* Select the N'th PTX device for the current host thread. The device must
668 have been opened before calling this function. */
669
670 static bool
671 nvptx_attach_host_thread_to_device (int n)
672 {
673 CUdevice dev;
674 CUresult r;
675 struct ptx_device *ptx_dev;
676 CUcontext thd_ctx;
677
678 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
679 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
680 {
681 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
682 return false;
683 }
684
685 if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
686 return true;
687 else
688 {
689 CUcontext old_ctx;
690
691 ptx_dev = ptx_devices[n];
692 if (!ptx_dev)
693 {
694 GOMP_PLUGIN_error ("device %d not found", n);
695 return false;
696 }
697
698 CUDA_CALL (cuCtxGetCurrent, &thd_ctx);
699
700 /* We don't necessarily have a current context (e.g. if it has been
701 destroyed). Pop it if we do, though. */
702 if (thd_ctx != NULL)
703 CUDA_CALL (cuCtxPopCurrent, &old_ctx);
704
705 CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
706 }
707 return true;
708 }
709
710 static struct ptx_device *
711 nvptx_open_device (int n)
712 {
713 struct ptx_device *ptx_dev;
714 CUdevice dev, ctx_dev;
715 CUresult r;
716 int async_engines, pi;
717
718 CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);
719
720 ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));
721
722 ptx_dev->ord = n;
723 ptx_dev->dev = dev;
724 ptx_dev->ctx_shared = false;
725
726 r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
727 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
728 {
729 GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
730 return NULL;
731 }
732
733 if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
734 {
735 /* The current host thread has an active context for a different device.
736 Detach it. */
737 CUcontext old_ctx;
738 CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
739 }
740
741 CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);
742
743 if (!ptx_dev->ctx)
744 CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
745 else
746 ptx_dev->ctx_shared = true;
747
748 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
749 &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
750 ptx_dev->overlap = pi;
751
752 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
753 &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
754 ptx_dev->map = pi;
755
756 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
757 &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
758 ptx_dev->concur = pi;
759
760 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
761 &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
762 ptx_dev->mode = pi;
763
764 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
765 &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
766 ptx_dev->mkern = pi;
767
768 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
769 &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
770 ptx_dev->clock_khz = pi;
771
772 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
773 &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
774 ptx_dev->num_sms = pi;
775
776 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
777 &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
778 ptx_dev->regs_per_block = pi;
779
780 /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82 is defined only
781 in CUDA 6.0 and newer. */
782 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi, 82, dev);
783 /* Fallback: use limit of registers per block, which is usually equal. */
784 if (r == CUDA_ERROR_INVALID_VALUE)
785 pi = ptx_dev->regs_per_block;
786 else if (r != CUDA_SUCCESS)
787 {
788 GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
789 return NULL;
790 }
791 ptx_dev->regs_per_sm = pi;
792
793 CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
794 &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
795 if (pi != 32)
796 {
797 GOMP_PLUGIN_error ("Only warp size 32 is supported");
798 return NULL;
799 }
800
801 r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
802 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
803 if (r != CUDA_SUCCESS)
804 async_engines = 1;
805
806 ptx_dev->images = NULL;
807 pthread_mutex_init (&ptx_dev->image_lock, NULL);
808
809 if (!init_streams_for_device (ptx_dev, async_engines))
810 return NULL;
811
812 return ptx_dev;
813 }
814
815 static bool
816 nvptx_close_device (struct ptx_device *ptx_dev)
817 {
818 if (!ptx_dev)
819 return true;
820
821 if (!fini_streams_for_device (ptx_dev))
822 return false;
823
824 pthread_mutex_destroy (&ptx_dev->image_lock);
825
826 if (!ptx_dev->ctx_shared)
827 CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
828
829 free (ptx_dev);
830 return true;
831 }
832
833 static int
834 nvptx_get_num_devices (void)
835 {
836 int n;
837
838 /* PR libgomp/65099: Currently, we only support offloading in 64-bit
839 configurations. */
840 if (sizeof (void *) != 8)
841 return 0;
842
843 /* This function will be called before the plugin has been initialized in
844 order to enumerate available devices, but CUDA API routines can't be used
845 until cuInit has been called. Just call it now (but don't yet do any
846 further initialization). */
847 if (instantiated_devices == 0)
848 {
849 if (!init_cuda_lib ())
850 return 0;
851 CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
852 /* This is not an error: e.g. we may have CUDA libraries installed but
853 no devices available. */
854 if (r != CUDA_SUCCESS)
855 return 0;
856 }
857
858 CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
859 return n;
860 }
861
862
863 static bool
864 link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
865 unsigned num_objs)
866 {
867 CUjit_option opts[6];
868 void *optvals[6];
869 float elapsed = 0.0;
870 char elog[1024];
871 char ilog[16384];
872 CUlinkState linkstate;
873 CUresult r;
874 void *linkout;
875 size_t linkoutsize __attribute__ ((unused));
876
877 opts[0] = CU_JIT_WALL_TIME;
878 optvals[0] = &elapsed;
879
880 opts[1] = CU_JIT_INFO_LOG_BUFFER;
881 optvals[1] = &ilog[0];
882
883 opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
884 optvals[2] = (void *) sizeof ilog;
885
886 opts[3] = CU_JIT_ERROR_LOG_BUFFER;
887 optvals[3] = &elog[0];
888
889 opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
890 optvals[4] = (void *) sizeof elog;
891
892 opts[5] = CU_JIT_LOG_VERBOSE;
893 optvals[5] = (void *) 1;
894
895 CUDA_CALL (cuLinkCreate, 6, opts, optvals, &linkstate);
896
897 for (; num_objs--; ptx_objs++)
898 {
899 /* cuLinkAddData's 'data' argument erroneously omits the const
900 qualifier. */
901 GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
902 r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
903 (char *) ptx_objs->code, ptx_objs->size,
904 0, 0, 0, 0);
905 if (r != CUDA_SUCCESS)
906 {
907 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
908 GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
909 cuda_error (r));
910 return false;
911 }
912 }
913
914 GOMP_PLUGIN_debug (0, "Linking\n");
915 r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);
916
917 GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
918 GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
919
920 if (r != CUDA_SUCCESS)
921 {
922 GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
923 return false;
924 }
925
926 CUDA_CALL (cuModuleLoadData, module, linkout);
927 CUDA_CALL (cuLinkDestroy, linkstate);
928 return true;
929 }
930
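/* Garbage-collect completed events: scan the global ptx_events list and, for
every event belonging to the current device that CUDA reports as complete,
destroy the event and release its bookkeeping. Kernel events additionally pop
their argument mapping; async-cleanup events unmap their variables after the
event lock has been released, and are deferred entirely when MEMMAP_LOCKABLE
is false. */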
931 static void
932 event_gc (bool memmap_lockable)
933 {
934 struct ptx_event *ptx_event = ptx_events;
935 struct ptx_event *async_cleanups = NULL;
936 struct nvptx_thread *nvthd = nvptx_thread ();
937
938 pthread_mutex_lock (&ptx_event_lock);
939
940 while (ptx_event != NULL)
941 {
942 CUresult r;
943 struct ptx_event *e = ptx_event;
944
945 ptx_event = ptx_event->next;
946
947 if (e->ord != nvthd->ptx_dev->ord)
948 continue;
949
950 r = CUDA_CALL_NOCHECK (cuEventQuery, *e->evt);
951 if (r == CUDA_SUCCESS)
952 {
953 bool append_async = false;
954 CUevent *te;
955
956 te = e->evt;
957
958 switch (e->type)
959 {
960 case PTX_EVT_MEM:
961 case PTX_EVT_SYNC:
962 break;
963
964 case PTX_EVT_KNL:
965 map_pop (e->addr);
966 break;
967
968 case PTX_EVT_ASYNC_CLEANUP:
969 {
970 /* The function GOMP_PLUGIN_async_unmap_vars needs to claim the
971 memory-map splay tree lock for the current device, so we
972 can't call it when one of our callers has already claimed
973 the lock. In that case, just delay the GC for this event
974 until later. */
975 if (!memmap_lockable)
976 continue;
977
978 append_async = true;
979 }
980 break;
981 }
982
983 CUDA_CALL_NOCHECK (cuEventDestroy, *te);
984 free ((void *)te);
985
986 /* Unlink 'e' from ptx_events list. */
987 if (ptx_events == e)
988 ptx_events = ptx_events->next;
989 else
990 {
991 struct ptx_event *e_ = ptx_events;
992 while (e_->next != e)
993 e_ = e_->next;
994 e_->next = e_->next->next;
995 }
996
997 if (append_async)
998 {
999 e->next = async_cleanups;
1000 async_cleanups = e;
1001 }
1002 else
1003 free (e);
1004 }
1005 }
1006
1007 pthread_mutex_unlock (&ptx_event_lock);
1008
1009 /* We have to do these here, after ptx_event_lock is released. */
1010 while (async_cleanups)
1011 {
1012 struct ptx_event *e = async_cleanups;
1013 async_cleanups = async_cleanups->next;
1014
1015 GOMP_PLUGIN_async_unmap_vars (e->addr, e->val);
1016 free (e);
1017 }
1018 }
1019
1020 static void
1021 event_add (enum ptx_event_type type, CUevent *e, void *h, int val)
1022 {
1023 struct ptx_event *ptx_event;
1024 struct nvptx_thread *nvthd = nvptx_thread ();
1025
1026 assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
1027 || type == PTX_EVT_ASYNC_CLEANUP);
1028
1029 ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event));
1030 ptx_event->type = type;
1031 ptx_event->evt = e;
1032 ptx_event->addr = h;
1033 ptx_event->ord = nvthd->ptx_dev->ord;
1034 ptx_event->val = val;
1035
1036 pthread_mutex_lock (&ptx_event_lock);
1037
1038 ptx_event->next = ptx_events;
1039 ptx_events = ptx_event;
1040
1041 pthread_mutex_unlock (&ptx_event_lock);
1042 }
1043
1044 void
1045 nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
1046 int async, unsigned *dims, void *targ_mem_desc)
1047 {
1048 struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
1049 CUfunction function;
1050 CUresult r;
1051 int i;
1052 struct ptx_stream *dev_str;
1053 void *kargs[1];
1054 void *hp, *dp;
1055 struct nvptx_thread *nvthd = nvptx_thread ();
1056 const char *maybe_abort_msg = "(perhaps abort was called)";
1057
1058 function = targ_fn->fn;
1059
1060 dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
1061 assert (dev_str == nvthd->current_stream);
1062
1063 /* Initialize the launch dimensions. Typically this is constant,
1064 provided by the device compiler, but we must permit runtime
1065 values. */
1066 int seen_zero = 0;
1067 for (i = 0; i != GOMP_DIM_MAX; i++)
1068 {
1069 if (targ_fn->launch->dim[i])
1070 dims[i] = targ_fn->launch->dim[i];
1071 if (!dims[i])
1072 seen_zero = 1;
1073 }
1074
1075 if (seen_zero)
1076 {
1077 /* See if the user provided GOMP_OPENACC_DIM environment
1078 variable to specify runtime defaults. */
1079 static int default_dims[GOMP_DIM_MAX];
1080
1081 pthread_mutex_lock (&ptx_dev_lock);
1082 if (!default_dims[0])
1083 {
1084 /* We only read the environment variable once. You can't
1085 change it in the middle of execution. The syntax is
1086 the same as for the -fopenacc-dim compilation option. */
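/* Illustrative examples (the numbers are arbitrary): GOMP_OPENACC_DIM=5120:32:32
sets the gang, worker and vector defaults; a position may be left empty, as in
GOMP_OPENACC_DIM=::32, to keep the computed default for that dimension. */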
1087 const char *env_var = getenv ("GOMP_OPENACC_DIM");
1088 if (env_var)
1089 {
1090 const char *pos = env_var;
1091
1092 for (i = 0; *pos && i != GOMP_DIM_MAX; i++)
1093 {
1094 if (i && *pos++ != ':')
1095 break;
1096 if (*pos != ':')
1097 {
1098 const char *eptr;
1099
1100 errno = 0;
1101 long val = strtol (pos, (char **)&eptr, 10);
1102 if (errno || val < 0 || (unsigned)val != val)
1103 break;
1104 default_dims[i] = (int)val;
1105 pos = eptr;
1106 }
1107 }
1108 }
1109
1110 int warp_size, block_size, dev_size, cpu_size;
1111 CUdevice dev = nvptx_thread()->ptx_dev->dev;
1112 /* 32 is the default for known hardware. */
1113 int gang = 0, worker = 32, vector = 32;
1114 CUdevice_attribute cu_tpb, cu_ws, cu_mpc, cu_tpm;
1115
1116 cu_tpb = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK;
1117 cu_ws = CU_DEVICE_ATTRIBUTE_WARP_SIZE;
1118 cu_mpc = CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT;
1119 cu_tpm = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR;
1120
1121 if (CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &block_size, cu_tpb,
1122 dev) == CUDA_SUCCESS
1123 && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &warp_size, cu_ws,
1124 dev) == CUDA_SUCCESS
1125 && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &dev_size, cu_mpc,
1126 dev) == CUDA_SUCCESS
1127 && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &cpu_size, cu_tpm,
1128 dev) == CUDA_SUCCESS)
1129 {
1130 GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
1131 " dev_size=%d, cpu_size=%d\n",
1132 warp_size, block_size, dev_size, cpu_size);
1133 gang = (cpu_size / block_size) * dev_size;
1134 worker = block_size / warp_size;
1135 vector = warp_size;
1136 }
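/* Worked example with hypothetical attribute values: cpu_size (max threads
per multiprocessor) = 2048, block_size (max threads per block) = 1024,
dev_size (multiprocessor count) = 16 and warp_size = 32 give
gang = (2048 / 1024) * 16 = 32, worker = 1024 / 32 = 32 and vector = 32. */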
1137
1138 /* There is no upper bound on the gang size. The best size
1139 matches the hardware configuration. Logical gangs are
1140 scheduled onto physical hardware. To maximize usage, we
1141 should guess a large number. */
1142 if (default_dims[GOMP_DIM_GANG] < 1)
1143 default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
1144 /* The worker size must not exceed the hardware. */
1145 if (default_dims[GOMP_DIM_WORKER] < 1
1146 || (default_dims[GOMP_DIM_WORKER] > worker && gang))
1147 default_dims[GOMP_DIM_WORKER] = worker;
1148 /* The vector size must exactly match the hardware. */
1149 if (default_dims[GOMP_DIM_VECTOR] < 1
1150 || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
1151 default_dims[GOMP_DIM_VECTOR] = vector;
1152
1153 GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
1154 default_dims[GOMP_DIM_GANG],
1155 default_dims[GOMP_DIM_WORKER],
1156 default_dims[GOMP_DIM_VECTOR]);
1157 }
1158 pthread_mutex_unlock (&ptx_dev_lock);
1159
1160 for (i = 0; i != GOMP_DIM_MAX; i++)
1161 if (!dims[i])
1162 dims[i] = default_dims[i];
1163 }
1164
1165 /* This reserves a chunk of a pre-allocated page of memory mapped on both
1166 the host and the device. HP is a host pointer to the new chunk, and DP is
1167 the corresponding device pointer. */
1168 map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);
1169
1170 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
1171
1172 /* Copy the array of arguments to the mapped page. */
1173 for (i = 0; i < mapnum; i++)
1174 ((void **) hp)[i] = devaddrs[i];
1175
1176 /* Copy the (device) pointers to arguments to the device (dp and hp might in
1177 fact have the same value on a unified-memory system). */
1178 CUDA_CALL_ASSERT (cuMemcpy, (CUdeviceptr) dp, (CUdeviceptr) hp,
1179 mapnum * sizeof (void *));
1180 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
1181 " gangs=%u, workers=%u, vectors=%u\n",
1182 __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
1183 dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);
1184
1185 // OpenACC CUDA
1186 //
1187 // num_gangs nctaid.x
1188 // num_workers ntid.y
1189 // vector length ntid.x
1190
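/* The kernel takes a single parameter: the device address DP of the array of
MAPNUM argument pointers copied above. */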
1191 kargs[0] = &dp;
1192 CUDA_CALL_ASSERT (cuLaunchKernel, function,
1193 dims[GOMP_DIM_GANG], 1, 1,
1194 dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
1195 0, dev_str->stream, kargs, 0);
1196
1197 #ifndef DISABLE_ASYNC
1198 if (async < acc_async_noval)
1199 {
1200 r = CUDA_CALL_NOCHECK (cuStreamSynchronize, dev_str->stream);
1201 if (r == CUDA_ERROR_LAUNCH_FAILED)
1202 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
1203 maybe_abort_msg);
1204 else if (r != CUDA_SUCCESS)
1205 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
1206 }
1207 else
1208 {
1209 CUevent *e;
1210
1211 e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
1212
1213 r = CUDA_CALL_NOCHECK (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1214 if (r == CUDA_ERROR_LAUNCH_FAILED)
1215 GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
1216 maybe_abort_msg);
1217 else if (r != CUDA_SUCCESS)
1218 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
1219
1220 event_gc (true);
1221
1222 CUDA_CALL_ASSERT (cuEventRecord, *e, dev_str->stream);
1223
1224 event_add (PTX_EVT_KNL, e, (void *)dev_str, 0);
1225 }
1226 #else
1227 r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
1228 if (r == CUDA_ERROR_LAUNCH_FAILED)
1229 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
1230 maybe_abort_msg);
1231 else if (r != CUDA_SUCCESS)
1232 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
1233 #endif
1234
1235 GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__,
1236 targ_fn->launch->fn);
1237
1238 #ifndef DISABLE_ASYNC
1239 if (async < acc_async_noval)
1240 #endif
1241 map_pop (dev_str);
1242 }
1243
1244 void * openacc_get_current_cuda_context (void);
1245
1246 static void *
1247 nvptx_alloc (size_t s)
1248 {
1249 CUdeviceptr d;
1250
1251 CUDA_CALL_ERET (NULL, cuMemAlloc, &d, s);
1252 return (void *) d;
1253 }
1254
1255 static bool
1256 nvptx_free (void *p)
1257 {
1258 CUdeviceptr pb;
1259 size_t ps;
1260
1261 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) p);
1262 if ((CUdeviceptr) p != pb)
1263 {
1264 GOMP_PLUGIN_error ("invalid device address");
1265 return false;
1266 }
1267
1268 CUDA_CALL (cuMemFree, (CUdeviceptr) p);
1269 return true;
1270 }
1271
1272
1273 static bool
1274 nvptx_host2dev (void *d, const void *h, size_t s)
1275 {
1276 CUdeviceptr pb;
1277 size_t ps;
1278 struct nvptx_thread *nvthd = nvptx_thread ();
1279
1280 if (!s)
1281 return true;
1282 if (!d)
1283 {
1284 GOMP_PLUGIN_error ("invalid device address");
1285 return false;
1286 }
1287
1288 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
1289
1290 if (!pb)
1291 {
1292 GOMP_PLUGIN_error ("invalid device address");
1293 return false;
1294 }
1295 if (!h)
1296 {
1297 GOMP_PLUGIN_error ("invalid host address");
1298 return false;
1299 }
1300 if (d == h)
1301 {
1302 GOMP_PLUGIN_error ("invalid host or device address");
1303 return false;
1304 }
1305 if ((void *)(d + s) > (void *)(pb + ps))
1306 {
1307 GOMP_PLUGIN_error ("invalid size");
1308 return false;
1309 }
1310
1311 #ifndef DISABLE_ASYNC
1312 if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
1313 {
1314 CUevent *e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
1315 CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1316 event_gc (false);
1317 CUDA_CALL (cuMemcpyHtoDAsync,
1318 (CUdeviceptr) d, h, s, nvthd->current_stream->stream);
1319 CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
1320 event_add (PTX_EVT_MEM, e, (void *)h, 0);
1321 }
1322 else
1323 #endif
1324 CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) d, h, s);
1325
1326 return true;
1327 }
1328
1329 static bool
1330 nvptx_dev2host (void *h, const void *d, size_t s)
1331 {
1332 CUdeviceptr pb;
1333 size_t ps;
1334 struct nvptx_thread *nvthd = nvptx_thread ();
1335
1336 if (!s)
1337 return true;
1338 if (!d)
1339 {
1340 GOMP_PLUGIN_error ("invalid device address");
1341 return false;
1342 }
1343
1344 CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
1345
1346 if (!pb)
1347 {
1348 GOMP_PLUGIN_error ("invalid device address");
1349 return false;
1350 }
1351 if (!h)
1352 {
1353 GOMP_PLUGIN_error ("invalid host address");
1354 return false;
1355 }
1356 if (d == h)
1357 {
1358 GOMP_PLUGIN_error ("invalid host or device address");
1359 return false;
1360 }
1361 if ((void *)(d + s) > (void *)(pb + ps))
1362 {
1363 GOMP_PLUGIN_error ("invalid size");
1364 return false;
1365 }
1366
1367 #ifndef DISABLE_ASYNC
1368 if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
1369 {
1370 CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1371 CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1372 event_gc (false);
1373 CUDA_CALL (cuMemcpyDtoHAsync,
1374 h, (CUdeviceptr) d, s, nvthd->current_stream->stream);
1375 CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
1376 event_add (PTX_EVT_MEM, e, (void *)h, 0);
1377 }
1378 else
1379 #endif
1380 CUDA_CALL (cuMemcpyDtoH, h, (CUdeviceptr) d, s);
1381
1382 return true;
1383 }
1384
1385 static void
1386 nvptx_set_async (int async)
1387 {
1388 struct nvptx_thread *nvthd = nvptx_thread ();
1389 nvthd->current_stream
1390 = select_stream_for_async (async, pthread_self (), true, NULL);
1391 }
1392
1393 static int
1394 nvptx_async_test (int async)
1395 {
1396 CUresult r;
1397 struct ptx_stream *s;
1398
1399 s = select_stream_for_async (async, pthread_self (), false, NULL);
1400
1401 if (!s)
1402 GOMP_PLUGIN_fatal ("unknown async %d", async);
1403
1404 r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
1405 if (r == CUDA_SUCCESS)
1406 {
1407 /* The oacc-parallel.c:goacc_wait function calls this hook to determine
1408 whether all work has completed on this stream, and if so omits the call
1409 to the wait hook. If that happens, event_gc might not get called
1410 (which prevents variables from getting unmapped and their associated
1411 device storage freed), so call it here. */
1412 event_gc (true);
1413 return 1;
1414 }
1415 else if (r == CUDA_ERROR_NOT_READY)
1416 return 0;
1417
1418 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
1419
1420 return 0;
1421 }
1422
1423 static int
1424 nvptx_async_test_all (void)
1425 {
1426 struct ptx_stream *s;
1427 pthread_t self = pthread_self ();
1428 struct nvptx_thread *nvthd = nvptx_thread ();
1429
1430 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1431
1432 for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
1433 {
1434 if ((s->multithreaded || pthread_equal (s->host_thread, self))
1435 && CUDA_CALL_NOCHECK (cuStreamQuery,
1436 s->stream) == CUDA_ERROR_NOT_READY)
1437 {
1438 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1439 return 0;
1440 }
1441 }
1442
1443 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1444
1445 event_gc (true);
1446
1447 return 1;
1448 }
1449
1450 static void
1451 nvptx_wait (int async)
1452 {
1453 struct ptx_stream *s;
1454
1455 s = select_stream_for_async (async, pthread_self (), false, NULL);
1456 if (!s)
1457 GOMP_PLUGIN_fatal ("unknown async %d", async);
1458
1459 CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
1460
1461 event_gc (true);
1462 }
1463
1464 static void
1465 nvptx_wait_async (int async1, int async2)
1466 {
1467 CUevent *e;
1468 struct ptx_stream *s1, *s2;
1469 pthread_t self = pthread_self ();
1470
1471 /* The stream that is waiting (rather than being waited for) doesn't
1472 necessarily have to exist already. */
1473 s2 = select_stream_for_async (async2, self, true, NULL);
1474
1475 s1 = select_stream_for_async (async1, self, false, NULL);
1476 if (!s1)
1477 GOMP_PLUGIN_fatal ("invalid async 1\n");
1478
1479 if (s1 == s2)
1480 GOMP_PLUGIN_fatal ("identical parameters");
1481
1482 e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1483
1484 CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1485
1486 event_gc (true);
1487
1488 CUDA_CALL_ASSERT (cuEventRecord, *e, s1->stream);
1489
1490 event_add (PTX_EVT_SYNC, e, NULL, 0);
1491
1492 CUDA_CALL_ASSERT (cuStreamWaitEvent, s2->stream, *e, 0);
1493 }
1494
1495 static void
1496 nvptx_wait_all (void)
1497 {
1498 CUresult r;
1499 struct ptx_stream *s;
1500 pthread_t self = pthread_self ();
1501 struct nvptx_thread *nvthd = nvptx_thread ();
1502
1503 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1504
1505 /* Wait for active streams initiated by this thread (or by multiple threads)
1506 to complete. */
1507 for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
1508 {
1509 if (s->multithreaded || pthread_equal (s->host_thread, self))
1510 {
1511 r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
1512 if (r == CUDA_SUCCESS)
1513 continue;
1514 else if (r != CUDA_ERROR_NOT_READY)
1515 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
1516
1517 CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
1518 }
1519 }
1520
1521 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1522
1523 event_gc (true);
1524 }
1525
1526 static void
1527 nvptx_wait_all_async (int async)
1528 {
1529 struct ptx_stream *waiting_stream, *other_stream;
1530 CUevent *e;
1531 struct nvptx_thread *nvthd = nvptx_thread ();
1532 pthread_t self = pthread_self ();
1533
1534 /* The stream doing the waiting. This could be the first mention of the
1535 stream, so create it if necessary. */
1536 waiting_stream
1537 = select_stream_for_async (async, pthread_self (), true, NULL);
1538
1539 /* Launches on the null stream already block on other streams in the
1540 context. */
1541 if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
1542 return;
1543
1544 event_gc (true);
1545
1546 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1547
1548 for (other_stream = nvthd->ptx_dev->active_streams;
1549 other_stream != NULL;
1550 other_stream = other_stream->next)
1551 {
1552 if (!other_stream->multithreaded
1553 && !pthread_equal (other_stream->host_thread, self))
1554 continue;
1555
1556 e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1557
1558 CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1559
1560 /* Record an event on the waited-for stream. */
1561 CUDA_CALL_ASSERT (cuEventRecord, *e, other_stream->stream);
1562
1563 event_add (PTX_EVT_SYNC, e, NULL, 0);
1564
1565 CUDA_CALL_ASSERT (cuStreamWaitEvent, waiting_stream->stream, *e, 0);
1566 }
1567
1568 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1569 }
1570
1571 static void *
1572 nvptx_get_current_cuda_device (void)
1573 {
1574 struct nvptx_thread *nvthd = nvptx_thread ();
1575
1576 if (!nvthd || !nvthd->ptx_dev)
1577 return NULL;
1578
1579 return &nvthd->ptx_dev->dev;
1580 }
1581
1582 static void *
1583 nvptx_get_current_cuda_context (void)
1584 {
1585 struct nvptx_thread *nvthd = nvptx_thread ();
1586
1587 if (!nvthd || !nvthd->ptx_dev)
1588 return NULL;
1589
1590 return nvthd->ptx_dev->ctx;
1591 }
1592
1593 static void *
1594 nvptx_get_cuda_stream (int async)
1595 {
1596 struct ptx_stream *s;
1597 struct nvptx_thread *nvthd = nvptx_thread ();
1598
1599 if (!nvthd || !nvthd->ptx_dev)
1600 return NULL;
1601
1602 s = select_stream_for_async (async, pthread_self (), false, NULL);
1603
1604 return s ? s->stream : NULL;
1605 }
1606
1607 static int
1608 nvptx_set_cuda_stream (int async, void *stream)
1609 {
1610 struct ptx_stream *oldstream;
1611 pthread_t self = pthread_self ();
1612 struct nvptx_thread *nvthd = nvptx_thread ();
1613
1614 if (async < 0)
1615 GOMP_PLUGIN_fatal ("bad async %d", async);
1616
1617 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1618
1619 /* We have a list of active streams and an array mapping async values to
1620 entries of that list. We need to take "ownership" of the passed-in stream,
1621 and add it to our list, removing the previous entry also (if there was one)
1622 in order to prevent resource leaks. Note the potential for surprise
1623 here: maybe we should keep track of passed-in streams and leave it up to
1624 the user to tidy those up, but that doesn't work for stream handles
1625 returned from acc_get_cuda_stream above... */
1626
1627 oldstream = select_stream_for_async (async, self, false, NULL);
1628
1629 if (oldstream)
1630 {
1631 if (nvthd->ptx_dev->active_streams == oldstream)
1632 nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
1633 else
1634 {
1635 struct ptx_stream *s = nvthd->ptx_dev->active_streams;
1636 while (s->next != oldstream)
1637 s = s->next;
1638 s->next = s->next->next;
1639 }
1640
1641 CUDA_CALL_ASSERT (cuStreamDestroy, oldstream->stream);
1642
1643 if (!map_fini (oldstream))
1644 GOMP_PLUGIN_fatal ("error when freeing host memory");
1645
1646 free (oldstream);
1647 }
1648
1649 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1650
1651 (void) select_stream_for_async (async, self, true, (CUstream) stream);
1652
1653 return 1;
1654 }
1655
1656 /* Plugin entry points. */
1657
1658 const char *
1659 GOMP_OFFLOAD_get_name (void)
1660 {
1661 return "nvptx";
1662 }
1663
1664 unsigned int
1665 GOMP_OFFLOAD_get_caps (void)
1666 {
1667 return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
1668 }
1669
1670 int
1671 GOMP_OFFLOAD_get_type (void)
1672 {
1673 return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
1674 }
1675
1676 int
1677 GOMP_OFFLOAD_get_num_devices (void)
1678 {
1679 return nvptx_get_num_devices ();
1680 }
1681
1682 bool
1683 GOMP_OFFLOAD_init_device (int n)
1684 {
1685 struct ptx_device *dev;
1686
1687 pthread_mutex_lock (&ptx_dev_lock);
1688
1689 if (!nvptx_init () || ptx_devices[n] != NULL)
1690 {
1691 pthread_mutex_unlock (&ptx_dev_lock);
1692 return false;
1693 }
1694
1695 dev = nvptx_open_device (n);
1696 if (dev)
1697 {
1698 ptx_devices[n] = dev;
1699 instantiated_devices++;
1700 }
1701
1702 pthread_mutex_unlock (&ptx_dev_lock);
1703
1704 return dev != NULL;
1705 }
1706
1707 bool
1708 GOMP_OFFLOAD_fini_device (int n)
1709 {
1710 pthread_mutex_lock (&ptx_dev_lock);
1711
1712 if (ptx_devices[n] != NULL)
1713 {
1714 if (!nvptx_attach_host_thread_to_device (n)
1715 || !nvptx_close_device (ptx_devices[n]))
1716 {
1717 pthread_mutex_unlock (&ptx_dev_lock);
1718 return false;
1719 }
1720 ptx_devices[n] = NULL;
1721 instantiated_devices--;
1722 }
1723
1724 pthread_mutex_unlock (&ptx_dev_lock);
1725 return true;
1726 }
1727
1728 /* Return the libgomp version number we're compatible with. There is
1729 no requirement for cross-version compatibility. */
1730
1731 unsigned
1732 GOMP_OFFLOAD_version (void)
1733 {
1734 return GOMP_VERSION;
1735 }
1736
1737 /* Initialize __nvptx_clocktick, if present in MODULE. */
1738
1739 static void
1740 nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
1741 {
1742 CUdeviceptr dptr;
1743 CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
1744 module, "__nvptx_clocktick");
1745 if (r == CUDA_ERROR_NOT_FOUND)
1746 return;
1747 if (r != CUDA_SUCCESS)
1748 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1749 double __nvptx_clocktick = 1e-3 / dev->clock_khz;
1750 r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
1751 sizeof (__nvptx_clocktick));
1752 if (r != CUDA_SUCCESS)
1753 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
1754 }
1755
1756 /* Load the (partial) program described by TARGET_DATA to device
1757 number ORD. Allocate and return TARGET_TABLE. */
1758
1759 int
1760 GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
1761 struct addr_pair **target_table)
1762 {
1763 CUmodule module;
1764 const char *const *var_names;
1765 const struct targ_fn_launch *fn_descs;
1766 unsigned int fn_entries, var_entries, i, j;
1767 struct targ_fn_descriptor *targ_fns;
1768 struct addr_pair *targ_tbl;
1769 const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
1770 struct ptx_image_data *new_image;
1771 struct ptx_device *dev;
1772
1773 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1774 {
1775 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1776 " (expected %u, received %u)",
1777 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1778 return -1;
1779 }
1780
1781 if (!nvptx_attach_host_thread_to_device (ord)
1782 || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
1783 return -1;
1784
1785 dev = ptx_devices[ord];
1786
1787 /* The mkoffload utility emits a struct of pointers/integers at the
1788 start of each offload image. The array of kernel names and the
1789 function addresses form a one-to-one correspondence. */
1790
1791 var_entries = img_header->var_num;
1792 var_names = img_header->var_names;
1793 fn_entries = img_header->fn_num;
1794 fn_descs = img_header->fn_descs;
1795
1796 targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
1797 * (fn_entries + var_entries));
1798 targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
1799 * fn_entries);
1800
1801 *target_table = targ_tbl;
1802
1803 new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
1804 new_image->target_data = target_data;
1805 new_image->module = module;
1806 new_image->fns = targ_fns;
1807
1808 pthread_mutex_lock (&dev->image_lock);
1809 new_image->next = dev->images;
1810 dev->images = new_image;
1811 pthread_mutex_unlock (&dev->image_lock);
1812
1813 for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
1814 {
1815 CUfunction function;
1816 int nregs, mthrs;
1817
1818 CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
1819 fn_descs[i].fn);
1820 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
1821 CU_FUNC_ATTRIBUTE_NUM_REGS, function);
1822 CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
1823 CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);
1824
1825 targ_fns->fn = function;
1826 targ_fns->launch = &fn_descs[i];
1827 targ_fns->regs_per_thread = nregs;
1828 targ_fns->max_threads_per_block = mthrs;
1829
1830 targ_tbl->start = (uintptr_t) targ_fns;
1831 targ_tbl->end = targ_tbl->start + 1;
1832 }
1833
1834 for (j = 0; j < var_entries; j++, targ_tbl++)
1835 {
1836 CUdeviceptr var;
1837 size_t bytes;
1838
1839 CUDA_CALL_ERET (-1, cuModuleGetGlobal,
1840 &var, &bytes, module, var_names[j]);
1841
1842 targ_tbl->start = (uintptr_t) var;
1843 targ_tbl->end = targ_tbl->start + bytes;
1844 }
1845
1846 nvptx_set_clocktick (module, dev);
1847
1848 return fn_entries + var_entries;
1849 }
1850
1851 /* Unload the program described by TARGET_DATA, freeing the function
1852 descriptors allocated by GOMP_OFFLOAD_load_image. */
1853
1854 bool
1855 GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
1856 {
1857 struct ptx_image_data *image, **prev_p;
1858 struct ptx_device *dev = ptx_devices[ord];
1859
1860 if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
1861 {
1862 GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
1863 " (expected %u, received %u)",
1864 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
1865 return false;
1866 }
1867
1868 bool ret = true;
1869 pthread_mutex_lock (&dev->image_lock);
1870 for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
1871 if (image->target_data == target_data)
1872 {
1873 *prev_p = image->next;
1874 if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
1875 ret = false;
1876 free (image->fns);
1877 free (image);
1878 break;
1879 }
1880 pthread_mutex_unlock (&dev->image_lock);
1881 return ret;
1882 }
1883
1884 void *
1885 GOMP_OFFLOAD_alloc (int ord, size_t size)
1886 {
1887 if (!nvptx_attach_host_thread_to_device (ord))
1888 return NULL;
1889 return nvptx_alloc (size);
1890 }
1891
1892 bool
1893 GOMP_OFFLOAD_free (int ord, void *ptr)
1894 {
1895 return (nvptx_attach_host_thread_to_device (ord)
1896 && nvptx_free (ptr));
1897 }
1898
1899 bool
1900 GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
1901 {
1902 return (nvptx_attach_host_thread_to_device (ord)
1903 && nvptx_dev2host (dst, src, n));
1904 }
1905
1906 bool
1907 GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
1908 {
1909 return (nvptx_attach_host_thread_to_device (ord)
1910 && nvptx_host2dev (dst, src, n));
1911 }
1912
1913 bool
1914 GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
1915 {
1916 struct ptx_device *ptx_dev = ptx_devices[ord];
1917 CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n,
1918 ptx_dev->null_stream->stream);
1919 return true;
1920 }
1921
1922 void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;
1923
1924 void
1925 GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum,
1926 void **hostaddrs, void **devaddrs,
1927 int async, unsigned *dims, void *targ_mem_desc)
1928 {
1929 nvptx_exec (fn, mapnum, hostaddrs, devaddrs, async, dims, targ_mem_desc);
1930 }
1931
1932 void
1933 GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc, int async)
1934 {
1935 struct nvptx_thread *nvthd = nvptx_thread ();
1936 CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1937
1938 CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
1939 CUDA_CALL_ASSERT (cuEventRecord, *e, nvthd->current_stream->stream);
1940 event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc, async);
1941 }
1942
1943 int
1944 GOMP_OFFLOAD_openacc_async_test (int async)
1945 {
1946 return nvptx_async_test (async);
1947 }
1948
1949 int
1950 GOMP_OFFLOAD_openacc_async_test_all (void)
1951 {
1952 return nvptx_async_test_all ();
1953 }
1954
1955 void
1956 GOMP_OFFLOAD_openacc_async_wait (int async)
1957 {
1958 nvptx_wait (async);
1959 }
1960
1961 void
1962 GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
1963 {
1964 nvptx_wait_async (async1, async2);
1965 }
1966
1967 void
1968 GOMP_OFFLOAD_openacc_async_wait_all (void)
1969 {
1970 nvptx_wait_all ();
1971 }
1972
1973 void
1974 GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
1975 {
1976 nvptx_wait_all_async (async);
1977 }
1978
1979 void
1980 GOMP_OFFLOAD_openacc_async_set_async (int async)
1981 {
1982 nvptx_set_async (async);
1983 }
1984
1985 void *
1986 GOMP_OFFLOAD_openacc_create_thread_data (int ord)
1987 {
1988 struct ptx_device *ptx_dev;
1989 struct nvptx_thread *nvthd
1990 = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
1991 CUcontext thd_ctx;
1992
1993 ptx_dev = ptx_devices[ord];
1994
1995 assert (ptx_dev);
1996
1997 CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);
1998
1999 assert (ptx_dev->ctx);
2000
2001 if (!thd_ctx)
2002 CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
2003
2004 nvthd->current_stream = ptx_dev->null_stream;
2005 nvthd->ptx_dev = ptx_dev;
2006
2007 return (void *) nvthd;
2008 }
2009
2010 void
2011 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
2012 {
2013 free (data);
2014 }
2015
2016 void *
2017 GOMP_OFFLOAD_openacc_get_current_cuda_device (void)
2018 {
2019 return nvptx_get_current_cuda_device ();
2020 }
2021
2022 void *
2023 GOMP_OFFLOAD_openacc_get_current_cuda_context (void)
2024 {
2025 return nvptx_get_current_cuda_context ();
2026 }
2027
2028 /* NOTE: This returns a CUstream, not a ptx_stream pointer. */
2029
2030 void *
2031 GOMP_OFFLOAD_openacc_get_cuda_stream (int async)
2032 {
2033 return nvptx_get_cuda_stream (async);
2034 }
2035
2036 /* NOTE: This takes a CUstream, not a ptx_stream pointer. */
2037
2038 int
2039 GOMP_OFFLOAD_openacc_set_cuda_stream (int async, void *stream)
2040 {
2041 return nvptx_set_cuda_stream (async, stream);
2042 }
2043
2044 /* Adjust launch dimensions: pick good values for number of blocks and warps
2045 and ensure that number of warps does not exceed CUDA limits as well as GCC's
2046 own limits. */
2047
2048 static void
2049 nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
2050 struct ptx_device *ptx_dev,
2051 int *teams_p, int *threads_p)
2052 {
2053 int max_warps_block = fn->max_threads_per_block / 32;
2054 /* A maximum of 32 warps per block is an implementation limit in the NVPTX
2055 backend and libgcc, which matches the documented limit of all GPUs as of 2015. */
2056 if (max_warps_block > 32)
2057 max_warps_block = 32;
2058 if (*threads_p <= 0)
2059 *threads_p = 8;
2060 if (*threads_p > max_warps_block)
2061 *threads_p = max_warps_block;
2062
2063 int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
2064 /* This is an estimate of how many blocks the device can host simultaneously.
2065 The actual limit, which may be lower, can be queried via the "occupancy
2066 control" driver interface (available since CUDA 6.0). */
2067 int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
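/* Illustrative example with hypothetical values: regs_per_thread = 32 and
*threads_p = 8 warps give regs_per_block = 32 * 32 * 8 = 8192; with
regs_per_sm = 65536 and num_sms = 16 the estimate is
max_blocks = (65536 / 8192) * 16 = 128. */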
2068 if (*teams_p <= 0 || *teams_p > max_blocks)
2069 *teams_p = max_blocks;
2070 }
2071
2072 /* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
2073 target regions. */
2074
2075 static size_t
2076 nvptx_stacks_size ()
2077 {
2078 return 128 * 1024;
2079 }
2080
2081 /* Return contiguous storage for NUM stacks, each SIZE bytes. */
2082
2083 static void *
2084 nvptx_stacks_alloc (size_t size, int num)
2085 {
2086 CUdeviceptr stacks;
2087 CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &stacks, size * num);
2088 if (r != CUDA_SUCCESS)
2089 GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
2090 return (void *) stacks;
2091 }
2092
2093 /* Release storage previously allocated by nvptx_stacks_alloc. */
2094
2095 static void
2096 nvptx_stacks_free (void *p, int num)
2097 {
2098 CUresult r = CUDA_CALL_NOCHECK (cuMemFree, (CUdeviceptr) p);
2099 if (r != CUDA_SUCCESS)
2100 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
2101 }
2102
2103 void
2104 GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
2105 {
2106 CUfunction function = ((struct targ_fn_descriptor *) tgt_fn)->fn;
2107 CUresult r;
2108 struct ptx_device *ptx_dev = ptx_devices[ord];
2109 const char *maybe_abort_msg = "(perhaps abort was called)";
2110 int teams = 0, threads = 0;
2111
2112 if (!args)
2113 GOMP_PLUGIN_fatal ("No target arguments provided");
2114 while (*args)
2115 {
2116 intptr_t id = (intptr_t) *args++, val;
2117 if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
2118 val = (intptr_t) *args++;
2119 else
2120 val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
2121 if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
2122 continue;
2123 val = val > INT_MAX ? INT_MAX : val;
2124 id &= GOMP_TARGET_ARG_ID_MASK;
2125 if (id == GOMP_TARGET_ARG_NUM_TEAMS)
2126 teams = val;
2127 else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
2128 threads = val;
2129 }
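/* The loop above decodes the GOMP_TARGET_ARG_* key/value pairs passed by
libgomp, keeping only the device-agnostic (GOMP_TARGET_ARG_DEVICE_ALL)
num-teams and thread-limit requests, clamped to INT_MAX. */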
2130 nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
2131
2132 size_t stack_size = nvptx_stacks_size ();
2133 void *stacks = nvptx_stacks_alloc (stack_size, teams * threads);
2134 void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
2135 size_t fn_args_size = sizeof fn_args;
2136 void *config[] = {
2137 CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
2138 CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
2139 CU_LAUNCH_PARAM_END
2140 };
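/* Unlike the OpenACC launch path above, arguments are passed through the
'extra' parameter buffer (CU_LAUNCH_PARAM_BUFFER_POINTER / _SIZE) rather than
through kernelParams: the buffer carries the target vars pointer, the
soft-stack storage and the per-warp stack size. */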
2141 r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
2142 32, threads, 1, 0, ptx_dev->null_stream->stream,
2143 NULL, config);
2144 if (r != CUDA_SUCCESS)
2145 GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
2146
2147 r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
2148 if (r == CUDA_ERROR_LAUNCH_FAILED)
2149 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
2150 maybe_abort_msg);
2151 else if (r != CUDA_SUCCESS)
2152 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
2153 nvptx_stacks_free (stacks, teams * threads);
2154 }
2155
2156 void
2157 GOMP_OFFLOAD_async_run (int ord, void *tgt_fn, void *tgt_vars, void **args,
2158 void *async_data)
2159 {
2160 GOMP_PLUGIN_fatal ("GOMP_OFFLOAD_async_run unimplemented");
2161 }