libgomp/plugin/plugin-nvptx.c
1 /* Plugin for NVPTX execution.
2
3 Copyright (C) 2013-2015 Free Software Foundation, Inc.
4
5 Contributed by Mentor Embedded.
6
7 This file is part of the GNU Offloading and Multi Processing Library
8 (libgomp).
9
10 Libgomp is free software; you can redistribute it and/or modify it
11 under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3, or (at your option)
13 any later version.
14
15 Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
16 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
17 FOR A PARTICULAR PURPOSE. See the GNU General Public License for
18 more details.
19
20 Under Section 7 of GPL version 3, you are granted additional
21 permissions described in the GCC Runtime Library Exception, version
22 3.1, as published by the Free Software Foundation.
23
24 You should have received a copy of the GNU General Public License and
25 a copy of the GCC Runtime Library Exception along with this program;
26 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
27 <http://www.gnu.org/licenses/>. */
28
29 /* Nvidia PTX-specific parts of OpenACC support.  The CUDA driver
30 library appears to hold some implicit state, but the documentation
31 is not clear as to what that state might be, or how one might
32 propagate it from one thread to another.  */
33
34 #include "openacc.h"
35 #include "config.h"
36 #include "libgomp-plugin.h"
37 #include "oacc-ptx.h"
38 #include "oacc-plugin.h"
39
40 #include <pthread.h>
41 #include <cuda.h>
42 #include <stdbool.h>
43 #include <stdint.h>
44 #include <string.h>
45 #include <stdio.h>
46 #include <dlfcn.h>
47 #include <unistd.h>
48 #include <assert.h>
49
50 #define ARRAYSIZE(X) (sizeof (X) / sizeof ((X)[0]))
51
52 static struct
53 {
54 CUresult r;
55 char *m;
56 } cuda_errlist[] =
57 {
58 { CUDA_ERROR_INVALID_VALUE, "invalid value" },
59 { CUDA_ERROR_OUT_OF_MEMORY, "out of memory" },
60 { CUDA_ERROR_NOT_INITIALIZED, "not initialized" },
61 { CUDA_ERROR_DEINITIALIZED, "deinitialized" },
62 { CUDA_ERROR_PROFILER_DISABLED, "profiler disabled" },
63 { CUDA_ERROR_PROFILER_NOT_INITIALIZED, "profiler not initialized" },
64 { CUDA_ERROR_PROFILER_ALREADY_STARTED, "already started" },
65 { CUDA_ERROR_PROFILER_ALREADY_STOPPED, "already stopped" },
66 { CUDA_ERROR_NO_DEVICE, "no device" },
67 { CUDA_ERROR_INVALID_DEVICE, "invalid device" },
68 { CUDA_ERROR_INVALID_IMAGE, "invalid image" },
69 { CUDA_ERROR_INVALID_CONTEXT, "invalid context" },
70 { CUDA_ERROR_CONTEXT_ALREADY_CURRENT, "context already current" },
71 { CUDA_ERROR_MAP_FAILED, "map error" },
72 { CUDA_ERROR_UNMAP_FAILED, "unmap error" },
73 { CUDA_ERROR_ARRAY_IS_MAPPED, "array is mapped" },
74 { CUDA_ERROR_ALREADY_MAPPED, "already mapped" },
75 { CUDA_ERROR_NO_BINARY_FOR_GPU, "no binary for gpu" },
76 { CUDA_ERROR_ALREADY_ACQUIRED, "already acquired" },
77 { CUDA_ERROR_NOT_MAPPED, "not mapped" },
78 { CUDA_ERROR_NOT_MAPPED_AS_ARRAY, "not mapped as array" },
79 { CUDA_ERROR_NOT_MAPPED_AS_POINTER, "not mapped as pointer" },
80 { CUDA_ERROR_ECC_UNCORRECTABLE, "ecc uncorrectable" },
81 { CUDA_ERROR_UNSUPPORTED_LIMIT, "unsupported limit" },
82 { CUDA_ERROR_CONTEXT_ALREADY_IN_USE, "context already in use" },
83 { CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, "peer access unsupported" },
84 { CUDA_ERROR_INVALID_SOURCE, "invalid source" },
85 { CUDA_ERROR_FILE_NOT_FOUND, "file not found" },
86 { CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
87 "shared object symbol not found" },
88 { CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, "shared object init error" },
89 { CUDA_ERROR_OPERATING_SYSTEM, "operating system" },
90 { CUDA_ERROR_INVALID_HANDLE, "invalid handle" },
91 { CUDA_ERROR_NOT_FOUND, "not found" },
92 { CUDA_ERROR_NOT_READY, "not ready" },
93 { CUDA_ERROR_LAUNCH_FAILED, "launch error" },
94 { CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, "launch out of resources" },
95 { CUDA_ERROR_LAUNCH_TIMEOUT, "launch timeout" },
96 { CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
97 "launch incompatible texturing" },
98 { CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED, "peer access already enabled" },
99 { CUDA_ERROR_PEER_ACCESS_NOT_ENABLED, "peer access not enabled" },
100 { CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE, "primary context active" },
101 { CUDA_ERROR_CONTEXT_IS_DESTROYED, "context is destroyed" },
102 { CUDA_ERROR_ASSERT, "assert" },
103 { CUDA_ERROR_TOO_MANY_PEERS, "too many peers" },
104 { CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED,
105 "host memory already registered" },
106 { CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, "host memory not registered" },
107 { CUDA_ERROR_NOT_PERMITTED, "not permitted" },
108 { CUDA_ERROR_NOT_SUPPORTED, "not supported" },
109 { CUDA_ERROR_UNKNOWN, "unknown" }
110 };
111
112 static char errmsg[128];
113
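/* Return a human-readable message for CUDA result code R.  Unknown codes
   are formatted into a static buffer, which is adequate here because the
   result is only used for diagnostics on the way to a fatal error.  */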
114 static char *
115 cuda_error (CUresult r)
116 {
117 int i;
118
119 for (i = 0; i < ARRAYSIZE (cuda_errlist); i++)
120 {
121 if (cuda_errlist[i].r == r)
122 return &cuda_errlist[i].m[0];
123 }
124
125 sprintf (&errmsg[0], "unknown result code: %5d", r);
126
127 return &errmsg[0];
128 }
129
130 static unsigned int instantiated_devices = 0;
131 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
132
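/* A CUDA stream together with a page of page-locked host memory (H, with
   device alias D) used as a circular staging buffer for kernel argument
   blocks: H_BEGIN/H_END bound the buffer, H_NEXT is where the next block is
   carved out, H_PREV is the most recently pushed block, and H_TAIL is the
   oldest block still in flight (see map_push/map_pop below).  */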
133 struct ptx_stream
134 {
135 CUstream stream;
136 pthread_t host_thread;
137 bool multithreaded;
138
139 CUdeviceptr d;
140 void *h;
141 void *h_begin;
142 void *h_end;
143 void *h_next;
144 void *h_prev;
145 void *h_tail;
146
147 struct ptx_stream *next;
148 };
149
150 /* Thread-specific data for PTX. */
151
152 struct nvptx_thread
153 {
154 struct ptx_stream *current_stream;
155 struct ptx_device *ptx_dev;
156 };
157
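/* Header of one entry in a stream's staging buffer: the async queue it was
   pushed for, its total size (header included), and the payload itself.  */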
158 struct map
159 {
160 int async;
161 size_t size;
162 char mappings[0];
163 };
164
165 static void
166 map_init (struct ptx_stream *s)
167 {
168 CUresult r;
169
170 int size = getpagesize ();
171
172 assert (s);
173 assert (!s->d);
174 assert (!s->h);
175
176 r = cuMemAllocHost (&s->h, size);
177 if (r != CUDA_SUCCESS)
178 GOMP_PLUGIN_fatal ("cuMemAllocHost error: %s", cuda_error (r));
179
180 r = cuMemHostGetDevicePointer (&s->d, s->h, 0);
181 if (r != CUDA_SUCCESS)
182 GOMP_PLUGIN_fatal ("cuMemHostGetDevicePointer error: %s", cuda_error (r));
183
184 assert (s->h);
185
186 s->h_begin = s->h;
187 s->h_end = s->h_begin + size;
188 s->h_next = s->h_prev = s->h_tail = s->h_begin;
189
190 assert (s->h_next);
191 assert (s->h_end);
192 }
193
194 static void
195 map_fini (struct ptx_stream *s)
196 {
197 CUresult r;
198
199 r = cuMemFreeHost (s->h);
200 if (r != CUDA_SUCCESS)
201 GOMP_PLUGIN_fatal ("cuMemFreeHost error: %s", cuda_error (r));
202 }
203
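/* Retire the oldest entry in stream S's staging buffer by advancing the tail
   pointer past it, wrapping back to the start of the buffer if needed.  */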
204 static void
205 map_pop (struct ptx_stream *s)
206 {
207 struct map *m;
208
209 assert (s != NULL);
210 assert (s->h_next);
211 assert (s->h_prev);
212 assert (s->h_tail);
213
214 m = s->h_tail;
215
216 s->h_tail += m->size;
217
218 if (s->h_tail >= s->h_end)
219 s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);
220
221 if (s->h_next == s->h_tail)
222 s->h_prev = s->h_next;
223
224 assert (s->h_next >= s->h_begin);
225 assert (s->h_tail >= s->h_begin);
226 assert (s->h_prev >= s->h_begin);
227
228 assert (s->h_next <= s->h_end);
229 assert (s->h_tail <= s->h_end);
230 assert (s->h_prev <= s->h_end);
231 }
232
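/* Reserve SIZE bytes (plus a struct map header) in stream S's staging buffer
   on behalf of async queue ASYNC, wrapping to the start of the buffer when
   the remaining space is too small, and return host/device pointers to the
   payload in *H and *D.  */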
233 static void
234 map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d)
235 {
236 int left;
237 int offset;
238 struct map *m;
239
240 assert (s != NULL);
241
242 left = s->h_end - s->h_next;
243 size += sizeof (struct map);
244
245 assert (s->h_prev);
246 assert (s->h_next);
247
248 if (size >= left)
249 {
250 m = s->h_prev;
251 m->size += left;
252 s->h_next = s->h_begin;
253
254 if (s->h_next + size > s->h_end)
255 GOMP_PLUGIN_fatal ("unable to push map");
256 }
257
258 assert (s->h_next);
259
260 m = s->h_next;
261 m->async = async;
262 m->size = size;
263
264 offset = (void *)&m->mappings[0] - s->h;
265
266 *d = (void *)(s->d + offset);
267 *h = (void *)(s->h + offset);
268
269 s->h_prev = s->h_next;
270 s->h_next += size;
271
272 assert (s->h_prev);
273 assert (s->h_next);
274
275 assert (s->h_next >= s->h_begin);
276 assert (s->h_tail >= s->h_begin);
277 assert (s->h_prev >= s->h_begin);
278 assert (s->h_next <= s->h_end);
279 assert (s->h_tail <= s->h_end);
280 assert (s->h_prev <= s->h_end);
281
282 return;
283 }
284
285 /* Descriptor of a loaded function. */
286
287 struct targ_fn_descriptor
288 {
289 CUfunction fn;
290 const char *name;
291 };
292
293 /* A loaded PTX image. */
294 struct ptx_image_data
295 {
296 const void *target_data;
297 CUmodule module;
298
299 struct targ_fn_descriptor *fns; /* Array of functions. */
300
301 struct ptx_image_data *next;
302 };
303
304 struct ptx_device
305 {
306 CUcontext ctx;
307 bool ctx_shared;
308 CUdevice dev;
309 struct ptx_stream *null_stream;
310 /* All non-null streams associated with this device (actually context),
311 either created implicitly or passed in from the user (via
312 acc_set_cuda_stream). */
313 struct ptx_stream *active_streams;
314 struct {
315 struct ptx_stream **arr;
316 int size;
317 } async_streams;
318 /* A lock for use when manipulating the above stream list and array. */
319 pthread_mutex_t stream_lock;
320 int ord;
321 bool overlap;
322 bool map;
323 bool concur;
324 int mode;
325 bool mkern;
326
327 struct ptx_image_data *images; /* Images loaded on device. */
328 pthread_mutex_t image_lock; /* Lock for above list. */
329
330 struct ptx_device *next;
331 };
332
333 enum ptx_event_type
334 {
335 PTX_EVT_MEM,
336 PTX_EVT_KNL,
337 PTX_EVT_SYNC,
338 PTX_EVT_ASYNC_CLEANUP
339 };
340
341 struct ptx_event
342 {
343 CUevent *evt;
344 int type;
345 void *addr;
346 int ord;
347
348 struct ptx_event *next;
349 };
350
351 static pthread_mutex_t ptx_event_lock;
352 static struct ptx_event *ptx_events;
353
354 static struct ptx_device **ptx_devices;
355
356 #define _XSTR(s) _STR(s)
357 #define _STR(s) #s
358
359 static struct _synames
360 {
361 char *n;
362 } cuda_symnames[] =
363 {
364 { _XSTR (cuCtxCreate) },
365 { _XSTR (cuCtxDestroy) },
366 { _XSTR (cuCtxGetCurrent) },
367 { _XSTR (cuCtxPushCurrent) },
368 { _XSTR (cuCtxSynchronize) },
369 { _XSTR (cuDeviceGet) },
370 { _XSTR (cuDeviceGetAttribute) },
371 { _XSTR (cuDeviceGetCount) },
372 { _XSTR (cuEventCreate) },
373 { _XSTR (cuEventDestroy) },
374 { _XSTR (cuEventQuery) },
375 { _XSTR (cuEventRecord) },
376 { _XSTR (cuInit) },
377 { _XSTR (cuLaunchKernel) },
378 { _XSTR (cuLinkAddData) },
379 { _XSTR (cuLinkComplete) },
380 { _XSTR (cuLinkCreate) },
381 { _XSTR (cuMemAlloc) },
382 { _XSTR (cuMemAllocHost) },
383 { _XSTR (cuMemcpy) },
384 { _XSTR (cuMemcpyDtoH) },
385 { _XSTR (cuMemcpyDtoHAsync) },
386 { _XSTR (cuMemcpyHtoD) },
387 { _XSTR (cuMemcpyHtoDAsync) },
388 { _XSTR (cuMemFree) },
389 { _XSTR (cuMemFreeHost) },
390 { _XSTR (cuMemGetAddressRange) },
391 { _XSTR (cuMemHostGetDevicePointer) },
392 { _XSTR (cuMemHostRegister) },
393 { _XSTR (cuMemHostUnregister) },
394 { _XSTR (cuModuleGetFunction) },
395 { _XSTR (cuModuleLoadData) },
396 { _XSTR (cuStreamDestroy) },
397 { _XSTR (cuStreamQuery) },
398 { _XSTR (cuStreamSynchronize) },
399 { _XSTR (cuStreamWaitEvent) }
400 };
401
402 static int
403 verify_device_library (void)
404 {
405 int i;
406 void *dh, *ds;
407
408 dh = dlopen ("libcuda.so", RTLD_LAZY);
409 if (!dh)
410 return -1;
411
412 for (i = 0; i < ARRAYSIZE (cuda_symnames); i++)
413 {
414 ds = dlsym (dh, cuda_symnames[i].n);
415 if (!ds)
416 return -1;
417 }
418
419 dlclose (dh);
420
421 return 0;
422 }
423
424 static inline struct nvptx_thread *
425 nvptx_thread (void)
426 {
427 return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
428 }
429
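/* Set up the null stream and an (initially empty) table of CONCURRENCY async
   stream slots for PTX_DEV.  */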
430 static void
431 init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
432 {
433 int i;
434 struct ptx_stream *null_stream
435 = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
436
437 null_stream->stream = NULL;
438 null_stream->host_thread = pthread_self ();
439 null_stream->multithreaded = true;
440 null_stream->d = (CUdeviceptr) NULL;
441 null_stream->h = NULL;
442 map_init (null_stream);
443 ptx_dev->null_stream = null_stream;
444
445 ptx_dev->active_streams = NULL;
446 pthread_mutex_init (&ptx_dev->stream_lock, NULL);
447
448 if (concurrency < 1)
449 concurrency = 1;
450
451 /* This is just a guess -- make space for as many async streams as the
452 current device is capable of concurrently executing. This can grow
453 later as necessary. No streams are created yet. */
454 ptx_dev->async_streams.arr
455 = GOMP_PLUGIN_malloc (concurrency * sizeof (struct ptx_stream *));
456 ptx_dev->async_streams.size = concurrency;
457
458 for (i = 0; i < concurrency; i++)
459 ptx_dev->async_streams.arr[i] = NULL;
460 }
461
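/* Tear down all streams of PTX_DEV, including the null stream, releasing
   their staging buffers.  */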
462 static void
463 fini_streams_for_device (struct ptx_device *ptx_dev)
464 {
465 free (ptx_dev->async_streams.arr);
466
467 while (ptx_dev->active_streams != NULL)
468 {
469 struct ptx_stream *s = ptx_dev->active_streams;
470 ptx_dev->active_streams = ptx_dev->active_streams->next;
471
472 map_fini (s);
473 cuStreamDestroy (s->stream);
474 free (s);
475 }
476
477 map_fini (ptx_dev->null_stream);
478 free (ptx_dev->null_stream);
479 }
480
481 /* Select a stream for the (OpenACC-semantics) ASYNC argument on behalf of
482 the current thread THREAD (and the current device/context).  If CREATE is
483 true, create the stream if it does not yet exist (or use EXISTING if that
484 is non-NULL), and associate the stream with THREAD.  Return the stream to
485 use.  */
486
487 static struct ptx_stream *
488 select_stream_for_async (int async, pthread_t thread, bool create,
489 CUstream existing)
490 {
491 struct nvptx_thread *nvthd = nvptx_thread ();
492 /* Local copy of TLS variable. */
493 struct ptx_device *ptx_dev = nvthd->ptx_dev;
494 struct ptx_stream *stream = NULL;
495 int orig_async = async;
496
497 /* The special value acc_async_noval (-1) maps (for now) to an
498 implicitly-created stream, which is then handled the same as any other
499 numbered async stream. Other options are available, e.g. using the null
500 stream for anonymous async operations, or choosing an idle stream from an
501 active set. But, stick with this for now. */
502 if (async > acc_async_sync)
503 async++;
504
505 if (create)
506 pthread_mutex_lock (&ptx_dev->stream_lock);
507
508 /* NOTE: AFAICT there's no particular need for acc_async_sync to map to the
509 null stream, and in fact better performance may be obtainable if it doesn't
510 (because the null stream enforces overly-strict synchronisation with
511 respect to other streams for legacy reasons, and that's probably not
512 needed with OpenACC). Maybe investigate later. */
513 if (async == acc_async_sync)
514 stream = ptx_dev->null_stream;
515 else if (async >= 0 && async < ptx_dev->async_streams.size
516 && ptx_dev->async_streams.arr[async] && !(create && existing))
517 stream = ptx_dev->async_streams.arr[async];
518 else if (async >= 0 && create)
519 {
520 if (async >= ptx_dev->async_streams.size)
521 {
522 int i, newsize = ptx_dev->async_streams.size * 2;
523
524 if (async >= newsize)
525 newsize = async + 1;
526
527 ptx_dev->async_streams.arr
528 = GOMP_PLUGIN_realloc (ptx_dev->async_streams.arr,
529 newsize * sizeof (struct ptx_stream *));
530
531 for (i = ptx_dev->async_streams.size; i < newsize; i++)
532 ptx_dev->async_streams.arr[i] = NULL;
533
534 ptx_dev->async_streams.size = newsize;
535 }
536
537 /* Create a new stream on-demand if there isn't one already, or if we're
538 setting a particular async value to an existing (externally-provided)
539 stream. */
540 if (!ptx_dev->async_streams.arr[async] || existing)
541 {
542 CUresult r;
543 struct ptx_stream *s
544 = GOMP_PLUGIN_malloc (sizeof (struct ptx_stream));
545
546 if (existing)
547 s->stream = existing;
548 else
549 {
550 r = cuStreamCreate (&s->stream, CU_STREAM_DEFAULT);
551 if (r != CUDA_SUCCESS)
552 GOMP_PLUGIN_fatal ("cuStreamCreate error: %s", cuda_error (r));
553 }
554
555 /* If CREATE is true, we're going to be queueing some work on this
556 stream. Associate it with the current host thread. */
557 s->host_thread = thread;
558 s->multithreaded = false;
559
560 s->d = (CUdeviceptr) NULL;
561 s->h = NULL;
562 map_init (s);
563
564 s->next = ptx_dev->active_streams;
565 ptx_dev->active_streams = s;
566 ptx_dev->async_streams.arr[async] = s;
567 }
568
569 stream = ptx_dev->async_streams.arr[async];
570 }
571 else if (async < 0)
572 GOMP_PLUGIN_fatal ("bad async %d", async);
573
574 if (create)
575 {
576 assert (stream != NULL);
577
578 /* If we're trying to use the same stream from different threads
579 simultaneously, set stream->multithreaded to true. This affects the
580 behaviour of acc_async_test_all and acc_wait_all, which are supposed to
581 only wait for asynchronous launches from the same host thread they are
582 invoked on. If multiple threads use the same async value, we make note
583 of that here and fall back to testing/waiting for all threads in those
584 functions. */
585 if (thread != stream->host_thread)
586 stream->multithreaded = true;
587
588 pthread_mutex_unlock (&ptx_dev->stream_lock);
589 }
590 else if (stream && !stream->multithreaded
591 && !pthread_equal (stream->host_thread, thread))
592 GOMP_PLUGIN_fatal ("async %d used on wrong thread", orig_async);
593
594 return stream;
595 }
596
597 /* Initialize the CUDA driver and the table of devices.  Return TRUE on
598 success, else FALSE.  PTX_DEV_LOCK should be locked on entry and remains locked on exit.  */
599
600 static bool
601 nvptx_init (void)
602 {
603 CUresult r;
604 int rc;
605 int ndevs;
606
607 if (instantiated_devices != 0)
608 return true;
609
610 rc = verify_device_library ();
611 if (rc < 0)
612 return false;
613
614 r = cuInit (0);
615 if (r != CUDA_SUCCESS)
616 GOMP_PLUGIN_fatal ("cuInit error: %s", cuda_error (r));
617
618 ptx_events = NULL;
619
620 pthread_mutex_init (&ptx_event_lock, NULL);
621
622 r = cuDeviceGetCount (&ndevs);
623 if (r != CUDA_SUCCESS)
624 GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));
625
626 ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
627 * ndevs);
628
629 return true;
630 }
631
632 /* Select the N'th PTX device for the current host thread.  The device must
633 have been opened before this function is called.  */
634
635 static void
636 nvptx_attach_host_thread_to_device (int n)
637 {
638 CUdevice dev;
639 CUresult r;
640 struct ptx_device *ptx_dev;
641 CUcontext thd_ctx;
642
643 r = cuCtxGetDevice (&dev);
644 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
645 GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));
646
647 if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
648 return;
649 else
650 {
651 CUcontext old_ctx;
652
653 ptx_dev = ptx_devices[n];
654 assert (ptx_dev);
655
656 r = cuCtxGetCurrent (&thd_ctx);
657 if (r != CUDA_SUCCESS)
658 GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));
659
660 /* We don't necessarily have a current context (e.g. if it has been
661 destroyed).  Pop it if we do, though.  */
662 if (thd_ctx != NULL)
663 {
664 r = cuCtxPopCurrent (&old_ctx);
665 if (r != CUDA_SUCCESS)
666 GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
667 }
668
669 r = cuCtxPushCurrent (ptx_dev->ctx);
670 if (r != CUDA_SUCCESS)
671 GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
672 }
673 }
674
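/* Open PTX device N: reuse the host thread's current CUDA context if there is
   one (marking it shared), otherwise create a new context; then query the
   device attributes of interest and set up stream bookkeeping.  */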
675 static struct ptx_device *
676 nvptx_open_device (int n)
677 {
678 struct ptx_device *ptx_dev;
679 CUdevice dev, ctx_dev;
680 CUresult r;
681 int async_engines, pi;
682
683 r = cuDeviceGet (&dev, n);
684 if (r != CUDA_SUCCESS)
685 GOMP_PLUGIN_fatal ("cuDeviceGet error: %s", cuda_error (r));
686
687 ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));
688
689 ptx_dev->ord = n;
690 ptx_dev->dev = dev;
691 ptx_dev->ctx_shared = false;
692
693 r = cuCtxGetDevice (&ctx_dev);
694 if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
695 GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));
696
697 if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
698 {
699 /* The current host thread has an active context for a different device.
700 Detach it. */
701 CUcontext old_ctx;
702
703 r = cuCtxPopCurrent (&old_ctx);
704 if (r != CUDA_SUCCESS)
705 GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
706 }
707
708 r = cuCtxGetCurrent (&ptx_dev->ctx);
709 if (r != CUDA_SUCCESS)
710 GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));
711
712 if (!ptx_dev->ctx)
713 {
714 r = cuCtxCreate (&ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
715 if (r != CUDA_SUCCESS)
716 GOMP_PLUGIN_fatal ("cuCtxCreate error: %s", cuda_error (r));
717 }
718 else
719 ptx_dev->ctx_shared = true;
720
721 r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
722 if (r != CUDA_SUCCESS)
723 GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
724
725 ptx_dev->overlap = pi;
726
727 r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
728 if (r != CUDA_SUCCESS)
729 GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
730
731 ptx_dev->map = pi;
732
733 r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
734 if (r != CUDA_SUCCESS)
735 GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
736
737 ptx_dev->concur = pi;
738
739 r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
740 if (r != CUDA_SUCCESS)
741 GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
742
743 ptx_dev->mode = pi;
744
745 r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
746 if (r != CUDA_SUCCESS)
747 GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));
748
749 ptx_dev->mkern = pi;
750
751 r = cuDeviceGetAttribute (&async_engines,
752 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
753 if (r != CUDA_SUCCESS)
754 async_engines = 1;
755
756 ptx_dev->images = NULL;
757 pthread_mutex_init (&ptx_dev->image_lock, NULL);
758
759 init_streams_for_device (ptx_dev, async_engines);
760
761 return ptx_dev;
762 }
763
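/* Close PTX_DEV: tear down its streams and, unless its context was shared
   with the application, destroy the CUDA context.  */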
764 static void
765 nvptx_close_device (struct ptx_device *ptx_dev)
766 {
767 CUresult r;
768
769 if (!ptx_dev)
770 return;
771
772 fini_streams_for_device (ptx_dev);
773
774 pthread_mutex_destroy (&ptx_dev->image_lock);
775
776 if (!ptx_dev->ctx_shared)
777 {
778 r = cuCtxDestroy (ptx_dev->ctx);
779 if (r != CUDA_SUCCESS)
780 GOMP_PLUGIN_fatal ("cuCtxDestroy error: %s", cuda_error (r));
781 }
782
783 free (ptx_dev);
784 }
785
786 static int
787 nvptx_get_num_devices (void)
788 {
789 int n;
790 CUresult r;
791
792 /* PR libgomp/65099: Currently, we only support offloading in 64-bit
793 configurations. */
794 if (sizeof (void *) != 8)
795 return 0;
796
797 /* This function will be called before the plugin has been initialized in
798 order to enumerate available devices, but CUDA API routines can't be used
799 until cuInit has been called. Just call it now (but don't yet do any
800 further initialization). */
801 if (instantiated_devices == 0)
802 {
803 r = cuInit (0);
804 /* This is not an error: e.g. we may have CUDA libraries installed but
805 no devices available. */
806 if (r != CUDA_SUCCESS)
807 return 0;
808 }
809
810 r = cuDeviceGetCount (&n);
811 if (r != CUDA_SUCCESS)
812 GOMP_PLUGIN_fatal ("cuDeviceGetCount error: %s", cuda_error (r));
813
814 return n;
815 }
816
817
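/* JIT-link the PTX fragments provided by libgomp (abort, acc_on_device and
   the GOACC internal helpers) together with PTX_CODE, targeting compute
   capability 3.0, and load the result as *MODULE.  */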
818 static void
819 link_ptx (CUmodule *module, const char *ptx_code)
820 {
821 CUjit_option opts[7];
822 void *optvals[7];
823 float elapsed = 0.0;
824 #define LOGSIZE 8192
825 char elog[LOGSIZE];
826 char ilog[LOGSIZE];
827 unsigned long logsize = LOGSIZE;
828 CUlinkState linkstate;
829 CUresult r;
830 void *linkout;
831 size_t linkoutsize __attribute__ ((unused));
832
833 GOMP_PLUGIN_debug (0, "attempting to load:\n---\n%s\n---\n", ptx_code);
834
835 opts[0] = CU_JIT_WALL_TIME;
836 optvals[0] = &elapsed;
837
838 opts[1] = CU_JIT_INFO_LOG_BUFFER;
839 optvals[1] = &ilog[0];
840
841 opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
842 optvals[2] = (void *) logsize;
843
844 opts[3] = CU_JIT_ERROR_LOG_BUFFER;
845 optvals[3] = &elog[0];
846
847 opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
848 optvals[4] = (void *) logsize;
849
850 opts[5] = CU_JIT_LOG_VERBOSE;
851 optvals[5] = (void *) 1;
852
853 opts[6] = CU_JIT_TARGET;
854 optvals[6] = (void *) CU_TARGET_COMPUTE_30;
855
856 r = cuLinkCreate (7, opts, optvals, &linkstate);
857 if (r != CUDA_SUCCESS)
858 GOMP_PLUGIN_fatal ("cuLinkCreate error: %s", cuda_error (r));
859
860 char *abort_ptx = ABORT_PTX;
861 r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, abort_ptx,
862 strlen (abort_ptx) + 1, 0, 0, 0, 0);
863 if (r != CUDA_SUCCESS)
864 {
865 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
866 GOMP_PLUGIN_fatal ("cuLinkAddData (abort) error: %s", cuda_error (r));
867 }
868
869 char *acc_on_device_ptx = ACC_ON_DEVICE_PTX;
870 r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, acc_on_device_ptx,
871 strlen (acc_on_device_ptx) + 1, 0, 0, 0, 0);
872 if (r != CUDA_SUCCESS)
873 {
874 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
875 GOMP_PLUGIN_fatal ("cuLinkAddData (acc_on_device) error: %s",
876 cuda_error (r));
877 }
878
879 char *goacc_internal_ptx = GOACC_INTERNAL_PTX;
880 r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, goacc_internal_ptx,
881 strlen (goacc_internal_ptx) + 1, 0, 0, 0, 0);
882 if (r != CUDA_SUCCESS)
883 {
884 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
885 GOMP_PLUGIN_fatal ("cuLinkAddData (goacc_internal_ptx) error: %s",
886 cuda_error (r));
887 }
888
889 /* cuLinkAddData's 'data' argument erroneously omits the const qualifier. */
890 r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, (char *)ptx_code,
891 strlen (ptx_code) + 1, 0, 0, 0, 0);
892 if (r != CUDA_SUCCESS)
893 {
894 GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
895 GOMP_PLUGIN_fatal ("cuLinkAddData (ptx_code) error: %s", cuda_error (r));
896 }
897
898 r = cuLinkComplete (linkstate, &linkout, &linkoutsize);
899 if (r != CUDA_SUCCESS)
900 GOMP_PLUGIN_fatal ("cuLinkComplete error: %s", cuda_error (r));
901
902 GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
903 GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
904
905 r = cuModuleLoadData (module, linkout);
906 if (r != CUDA_SUCCESS)
907 GOMP_PLUGIN_fatal ("cuModuleLoadData error: %s", cuda_error (r));
908 }
909
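/* Garbage-collect completed events belonging to the current device: destroy
   each completed CUDA event, retire the kernel-argument mapping or perform
   the deferred unmap it refers to, and unlink it from the global list.
   MEMMAP_LOCKABLE says whether it is safe to take the memory-map lock, which
   processing PTX_EVT_ASYNC_CLEANUP events requires.  */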
910 static void
911 event_gc (bool memmap_lockable)
912 {
913 struct ptx_event *ptx_event = ptx_events;
914 struct nvptx_thread *nvthd = nvptx_thread ();
915
916 pthread_mutex_lock (&ptx_event_lock);
917
918 while (ptx_event != NULL)
919 {
920 CUresult r;
921 struct ptx_event *e = ptx_event;
922
923 ptx_event = ptx_event->next;
924
925 if (e->ord != nvthd->ptx_dev->ord)
926 continue;
927
928 r = cuEventQuery (*e->evt);
929 if (r == CUDA_SUCCESS)
930 {
931 CUevent *te;
932
933 te = e->evt;
934
935 switch (e->type)
936 {
937 case PTX_EVT_MEM:
938 case PTX_EVT_SYNC:
939 break;
940
941 case PTX_EVT_KNL:
942 map_pop (e->addr);
943 break;
944
945 case PTX_EVT_ASYNC_CLEANUP:
946 {
947 /* The function gomp_plugin_async_unmap_vars needs to claim the
948 memory-map splay tree lock for the current device, so we
949 can't call it when one of our callers has already claimed
950 the lock. In that case, just delay the GC for this event
951 until later. */
952 if (!memmap_lockable)
953 continue;
954
955 GOMP_PLUGIN_async_unmap_vars (e->addr);
956 }
957 break;
958 }
959
960 cuEventDestroy (*te);
961 free ((void *)te);
962
963 if (ptx_events == e)
964 ptx_events = ptx_events->next;
965 else
966 {
967 struct ptx_event *e_ = ptx_events;
968 while (e_->next != e)
969 e_ = e_->next;
970 e_->next = e_->next->next;
971 }
972
973 free (e);
974 }
975 }
976
977 pthread_mutex_unlock (&ptx_event_lock);
978 }
979
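/* Record a pending event E of kind TYPE, with associated address H, on the
   global event list, tagged with the current device's ordinal.  */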
980 static void
981 event_add (enum ptx_event_type type, CUevent *e, void *h)
982 {
983 struct ptx_event *ptx_event;
984 struct nvptx_thread *nvthd = nvptx_thread ();
985
986 assert (type == PTX_EVT_MEM || type == PTX_EVT_KNL || type == PTX_EVT_SYNC
987 || type == PTX_EVT_ASYNC_CLEANUP);
988
989 ptx_event = GOMP_PLUGIN_malloc (sizeof (struct ptx_event));
990 ptx_event->type = type;
991 ptx_event->evt = e;
992 ptx_event->addr = h;
993 ptx_event->ord = nvthd->ptx_dev->ord;
994
995 pthread_mutex_lock (&ptx_event_lock);
996
997 ptx_event->next = ptx_events;
998 ptx_events = ptx_event;
999
1000 pthread_mutex_unlock (&ptx_event_lock);
1001 }
1002
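/* Launch the offloaded function FN on the stream selected by ASYNC, passing
   it the MAPNUM device pointers in DEVADDRS via a block reserved in that
   stream's staging buffer.  NUM_GANGS maps to the CUDA grid size and
   VECTOR_LENGTH to the block size; synchronous launches block here until the
   kernel finishes, otherwise an event is recorded so the staging block can be
   reclaimed once the kernel completes.  */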
1003 void
1004 nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
1005 size_t *sizes, unsigned short *kinds, int num_gangs, int num_workers,
1006 int vector_length, int async, void *targ_mem_desc)
1007 {
1008 struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
1009 CUfunction function;
1010 CUresult r;
1011 int i;
1012 struct ptx_stream *dev_str;
1013 void *kargs[1];
1014 void *hp, *dp;
1015 unsigned int nthreads_in_block;
1016 struct nvptx_thread *nvthd = nvptx_thread ();
1017 const char *maybe_abort_msg = "(perhaps abort was called)";
1018
1019 function = targ_fn->fn;
1020
1021 dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
1022 assert (dev_str == nvthd->current_stream);
1023
1024 /* This reserves a chunk of a pre-allocated page of memory mapped on both
1025 the host and the device. HP is a host pointer to the new chunk, and DP is
1026 the corresponding device pointer. */
1027 map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);
1028
1029 GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
1030
1031 /* Copy the array of arguments to the mapped page. */
1032 for (i = 0; i < mapnum; i++)
1033 ((void **) hp)[i] = devaddrs[i];
1034
1035 /* Copy the (device) pointers to arguments to the device (dp and hp might in
1036 fact have the same value on a unified-memory system). */
1037 r = cuMemcpy ((CUdeviceptr)dp, (CUdeviceptr)hp, mapnum * sizeof (void *));
1038 if (r != CUDA_SUCCESS)
1039 GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuda_error (r));
1040
1041 GOMP_PLUGIN_debug (0, " %s: kernel %s: launch\n", __FUNCTION__, targ_fn->name);
1042
1043 // OpenACC CUDA
1044 //
1045 // num_gangs blocks
1046 // num_workers warps (where a warp is equivalent to 32 threads)
1047 // vector length threads
1048 //
1049
1050 /* The OpenACC vector_length clause 'determines the vector length to use for
1051 vector or SIMD operations'.  The question is how to map this to CUDA.
1052 
1053 In CUDA, the warp size is the vector length of a CUDA device.  However, the
1054 CUDA interface abstracts away from that, and only shows us warp size
1055 indirectly in the maximum number of threads per block, which is a product of
1056 warp size and the number of hyperthreads of a multiprocessor.
1057 
1058 We choose to map OpenACC vector_length directly onto the number of threads
1059 in a block, in the x dimension.  This is reflected in GCC code generation
1060 that uses threadIdx.x to access vector elements.
1061 
1062 Attempting to use an OpenACC vector_length of more than the maximum number
1063 of threads per block will result in a CUDA error.  */
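/* For example (illustrative values only): a region compiled with
   num_gangs(32) and vector_length(128) is launched below as a 32x1x1 grid of
   128x1x1-thread blocks.  */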
1064 nthreads_in_block = vector_length;
1065
1066 kargs[0] = &dp;
1067 r = cuLaunchKernel (function,
1068 num_gangs, 1, 1,
1069 nthreads_in_block, 1, 1,
1070 0, dev_str->stream, kargs, 0);
1071 if (r != CUDA_SUCCESS)
1072 GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
1073
1074 #ifndef DISABLE_ASYNC
1075 if (async < acc_async_noval)
1076 {
1077 r = cuStreamSynchronize (dev_str->stream);
1078 if (r == CUDA_ERROR_LAUNCH_FAILED)
1079 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
1080 maybe_abort_msg);
1081 else if (r != CUDA_SUCCESS)
1082 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
1083 }
1084 else
1085 {
1086 CUevent *e;
1087
1088 e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
1089
1090 r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
1091 if (r == CUDA_ERROR_LAUNCH_FAILED)
1092 GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
1093 maybe_abort_msg);
1094 else if (r != CUDA_SUCCESS)
1095 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
1096
1097 event_gc (true);
1098
1099 r = cuEventRecord (*e, dev_str->stream);
1100 if (r != CUDA_SUCCESS)
1101 GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
1102
1103 event_add (PTX_EVT_KNL, e, (void *)dev_str);
1104 }
1105 #else
1106 r = cuCtxSynchronize ();
1107 if (r == CUDA_ERROR_LAUNCH_FAILED)
1108 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
1109 maybe_abort_msg);
1110 else if (r != CUDA_SUCCESS)
1111 GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
1112 #endif
1113
1114 GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__,
1115 targ_fn->name);
1116
1117 #ifndef DISABLE_ASYNC
1118 if (async < acc_async_noval)
1119 #endif
1120 map_pop (dev_str);
1121 }
1122
1123 void * openacc_get_current_cuda_context (void);
1124
1125 static void *
1126 nvptx_alloc (size_t s)
1127 {
1128 CUdeviceptr d;
1129 CUresult r;
1130
1131 r = cuMemAlloc (&d, s);
1132 if (r == CUDA_ERROR_OUT_OF_MEMORY)
1133 return 0;
1134 if (r != CUDA_SUCCESS)
1135 GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
1136 return (void *)d;
1137 }
1138
1139 static void
1140 nvptx_free (void *p)
1141 {
1142 CUresult r;
1143 CUdeviceptr pb;
1144 size_t ps;
1145
1146 r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)p);
1147 if (r != CUDA_SUCCESS)
1148 GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));
1149
1150 if ((CUdeviceptr)p != pb)
1151 GOMP_PLUGIN_fatal ("invalid device address");
1152
1153 r = cuMemFree ((CUdeviceptr)p);
1154 if (r != CUDA_SUCCESS)
1155 GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
1156 }
1157
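/* Copy S bytes from host address H to device address D.  When the current
   stream is not the null stream the copy is issued asynchronously on it and
   an event is recorded for later cleanup; otherwise a blocking copy is
   used.  */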
1158 static void *
1159 nvptx_host2dev (void *d, const void *h, size_t s)
1160 {
1161 CUresult r;
1162 CUdeviceptr pb;
1163 size_t ps;
1164 struct nvptx_thread *nvthd = nvptx_thread ();
1165
1166 if (!s)
1167 return 0;
1168
1169 if (!d)
1170 GOMP_PLUGIN_fatal ("invalid device address");
1171
1172 r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
1173 if (r != CUDA_SUCCESS)
1174 GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));
1175
1176 if (!pb)
1177 GOMP_PLUGIN_fatal ("invalid device address");
1178
1179 if (!h)
1180 GOMP_PLUGIN_fatal ("invalid host address");
1181
1182 if (d == h)
1183 GOMP_PLUGIN_fatal ("invalid host or device address");
1184
1185 if ((void *)(d + s) > (void *)(pb + ps))
1186 GOMP_PLUGIN_fatal ("invalid size");
1187
1188 #ifndef DISABLE_ASYNC
1189 if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
1190 {
1191 CUevent *e;
1192
1193 e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
1194
1195 r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
1196 if (r != CUDA_SUCCESS)
1197 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
1198
1199 event_gc (false);
1200
1201 r = cuMemcpyHtoDAsync ((CUdeviceptr)d, h, s,
1202 nvthd->current_stream->stream);
1203 if (r != CUDA_SUCCESS)
1204 GOMP_PLUGIN_fatal ("cuMemcpyHtoDAsync error: %s", cuda_error (r));
1205
1206 r = cuEventRecord (*e, nvthd->current_stream->stream);
1207 if (r != CUDA_SUCCESS)
1208 GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
1209
1210 event_add (PTX_EVT_MEM, e, (void *)h);
1211 }
1212 else
1213 #endif
1214 {
1215 r = cuMemcpyHtoD ((CUdeviceptr)d, h, s);
1216 if (r != CUDA_SUCCESS)
1217 GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
1218 }
1219
1220 return 0;
1221 }
1222
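/* Copy S bytes from device address D to host address H, asynchronously on the
   current stream when one is selected (with an event recorded for tracking),
   blocking otherwise.  */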
1223 static void *
1224 nvptx_dev2host (void *h, const void *d, size_t s)
1225 {
1226 CUresult r;
1227 CUdeviceptr pb;
1228 size_t ps;
1229 struct nvptx_thread *nvthd = nvptx_thread ();
1230
1231 if (!s)
1232 return 0;
1233
1234 if (!d)
1235 GOMP_PLUGIN_fatal ("invalid device address");
1236
1237 r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
1238 if (r != CUDA_SUCCESS)
1239 GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));
1240
1241 if (!pb)
1242 GOMP_PLUGIN_fatal ("invalid device address");
1243
1244 if (!h)
1245 GOMP_PLUGIN_fatal ("invalid host address");
1246
1247 if (d == h)
1248 GOMP_PLUGIN_fatal ("invalid host or device address");
1249
1250 if ((void *)(d + s) > (void *)(pb + ps))
1251 GOMP_PLUGIN_fatal ("invalid size");
1252
1253 #ifndef DISABLE_ASYNC
1254 if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
1255 {
1256 CUevent *e;
1257
1258 e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
1259
1260 r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
1261 if (r != CUDA_SUCCESS)
1262 GOMP_PLUGIN_fatal ("cuEventCreate error: %s\n", cuda_error (r));
1263
1264 event_gc (false);
1265
1266 r = cuMemcpyDtoHAsync (h, (CUdeviceptr)d, s,
1267 nvthd->current_stream->stream);
1268 if (r != CUDA_SUCCESS)
1269 GOMP_PLUGIN_fatal ("cuMemcpyDtoHAsync error: %s", cuda_error (r));
1270
1271 r = cuEventRecord (*e, nvthd->current_stream->stream);
1272 if (r != CUDA_SUCCESS)
1273 GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
1274
1275 event_add (PTX_EVT_MEM, e, (void *)h);
1276 }
1277 else
1278 #endif
1279 {
1280 r = cuMemcpyDtoH (h, (CUdeviceptr)d, s);
1281 if (r != CUDA_SUCCESS)
1282 GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
1283 }
1284
1285 return 0;
1286 }
1287
1288 static void
1289 nvptx_set_async (int async)
1290 {
1291 struct nvptx_thread *nvthd = nvptx_thread ();
1292 nvthd->current_stream
1293 = select_stream_for_async (async, pthread_self (), true, NULL);
1294 }
1295
1296 static int
1297 nvptx_async_test (int async)
1298 {
1299 CUresult r;
1300 struct ptx_stream *s;
1301
1302 s = select_stream_for_async (async, pthread_self (), false, NULL);
1303
1304 if (!s)
1305 GOMP_PLUGIN_fatal ("unknown async %d", async);
1306
1307 r = cuStreamQuery (s->stream);
1308 if (r == CUDA_SUCCESS)
1309 {
1310 /* The oacc-parallel.c:goacc_wait function calls this hook to determine
1311 whether all work has completed on this stream, and if so omits the call
1312 to the wait hook. If that happens, event_gc might not get called
1313 (which prevents variables from getting unmapped and their associated
1314 device storage freed), so call it here. */
1315 event_gc (true);
1316 return 1;
1317 }
1318 else if (r == CUDA_ERROR_NOT_READY)
1319 return 0;
1320
1321 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
1322
1323 return 0;
1324 }
1325
1326 static int
1327 nvptx_async_test_all (void)
1328 {
1329 struct ptx_stream *s;
1330 pthread_t self = pthread_self ();
1331 struct nvptx_thread *nvthd = nvptx_thread ();
1332
1333 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1334
1335 for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
1336 {
1337 if ((s->multithreaded || pthread_equal (s->host_thread, self))
1338 && cuStreamQuery (s->stream) == CUDA_ERROR_NOT_READY)
1339 {
1340 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1341 return 0;
1342 }
1343 }
1344
1345 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1346
1347 event_gc (true);
1348
1349 return 1;
1350 }
1351
1352 static void
1353 nvptx_wait (int async)
1354 {
1355 CUresult r;
1356 struct ptx_stream *s;
1357
1358 s = select_stream_for_async (async, pthread_self (), false, NULL);
1359
1360 if (!s)
1361 GOMP_PLUGIN_fatal ("unknown async %d", async);
1362
1363 r = cuStreamSynchronize (s->stream);
1364 if (r != CUDA_SUCCESS)
1365 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
1366
1367 event_gc (true);
1368 }
1369
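/* Make async queue ASYNC2 wait for the work currently queued on async queue
   ASYNC1, by recording an event on ASYNC1's stream and making ASYNC2's stream
   wait for it.  */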
1370 static void
1371 nvptx_wait_async (int async1, int async2)
1372 {
1373 CUresult r;
1374 CUevent *e;
1375 struct ptx_stream *s1, *s2;
1376 pthread_t self = pthread_self ();
1377
1378 /* The stream that is waiting (rather than being waited for) doesn't
1379 necessarily have to exist already. */
1380 s2 = select_stream_for_async (async2, self, true, NULL);
1381
1382 s1 = select_stream_for_async (async1, self, false, NULL);
1383 if (!s1)
1384 GOMP_PLUGIN_fatal ("invalid async 1\n");
1385
1386 if (s1 == s2)
1387 GOMP_PLUGIN_fatal ("identical parameters");
1388
1389 e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
1390
1391 r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
1392 if (r != CUDA_SUCCESS)
1393 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
1394
1395 event_gc (true);
1396
1397 r = cuEventRecord (*e, s1->stream);
1398 if (r != CUDA_SUCCESS)
1399 GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
1400
1401 event_add (PTX_EVT_SYNC, e, NULL);
1402
1403 r = cuStreamWaitEvent (s2->stream, *e, 0);
1404 if (r != CUDA_SUCCESS)
1405 GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
1406 }
1407
1408 static void
1409 nvptx_wait_all (void)
1410 {
1411 CUresult r;
1412 struct ptx_stream *s;
1413 pthread_t self = pthread_self ();
1414 struct nvptx_thread *nvthd = nvptx_thread ();
1415
1416 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1417
1418 /* Wait for active streams initiated by this thread (or by multiple threads)
1419 to complete. */
1420 for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next)
1421 {
1422 if (s->multithreaded || pthread_equal (s->host_thread, self))
1423 {
1424 r = cuStreamQuery (s->stream);
1425 if (r == CUDA_SUCCESS)
1426 continue;
1427 else if (r != CUDA_ERROR_NOT_READY)
1428 GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
1429
1430 r = cuStreamSynchronize (s->stream);
1431 if (r != CUDA_SUCCESS)
1432 GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
1433 }
1434 }
1435
1436 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1437
1438 event_gc (true);
1439 }
1440
1441 static void
1442 nvptx_wait_all_async (int async)
1443 {
1444 CUresult r;
1445 struct ptx_stream *waiting_stream, *other_stream;
1446 CUevent *e;
1447 struct nvptx_thread *nvthd = nvptx_thread ();
1448 pthread_t self = pthread_self ();
1449
1450 /* The stream doing the waiting. This could be the first mention of the
1451 stream, so create it if necessary. */
1452 waiting_stream
1453 = select_stream_for_async (async, pthread_self (), true, NULL);
1454
1455 /* Launches on the null stream already block on other streams in the
1456 context. */
1457 if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
1458 return;
1459
1460 event_gc (true);
1461
1462 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1463
1464 for (other_stream = nvthd->ptx_dev->active_streams;
1465 other_stream != NULL;
1466 other_stream = other_stream->next)
1467 {
1468 if (!other_stream->multithreaded
1469 && !pthread_equal (other_stream->host_thread, self))
1470 continue;
1471
1472 e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1473
1474 r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
1475 if (r != CUDA_SUCCESS)
1476 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
1477
1478 /* Record an event on the waited-for stream. */
1479 r = cuEventRecord (*e, other_stream->stream);
1480 if (r != CUDA_SUCCESS)
1481 GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
1482
1483 event_add (PTX_EVT_SYNC, e, NULL);
1484
1485 r = cuStreamWaitEvent (waiting_stream->stream, *e, 0);
1486 if (r != CUDA_SUCCESS)
1487 GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
1488 }
1489
1490 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1491 }
1492
1493 static void *
1494 nvptx_get_current_cuda_device (void)
1495 {
1496 struct nvptx_thread *nvthd = nvptx_thread ();
1497
1498 if (!nvthd || !nvthd->ptx_dev)
1499 return NULL;
1500
1501 return &nvthd->ptx_dev->dev;
1502 }
1503
1504 static void *
1505 nvptx_get_current_cuda_context (void)
1506 {
1507 struct nvptx_thread *nvthd = nvptx_thread ();
1508
1509 if (!nvthd || !nvthd->ptx_dev)
1510 return NULL;
1511
1512 return nvthd->ptx_dev->ctx;
1513 }
1514
1515 static void *
1516 nvptx_get_cuda_stream (int async)
1517 {
1518 struct ptx_stream *s;
1519 struct nvptx_thread *nvthd = nvptx_thread ();
1520
1521 if (!nvthd || !nvthd->ptx_dev)
1522 return NULL;
1523
1524 s = select_stream_for_async (async, pthread_self (), false, NULL);
1525
1526 return s ? s->stream : NULL;
1527 }
1528
1529 static int
1530 nvptx_set_cuda_stream (int async, void *stream)
1531 {
1532 struct ptx_stream *oldstream;
1533 pthread_t self = pthread_self ();
1534 struct nvptx_thread *nvthd = nvptx_thread ();
1535
1536 pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
1537
1538 if (async < 0)
1539 GOMP_PLUGIN_fatal ("bad async %d", async);
1540
1541 /* We have a list of active streams and an array mapping async values to
1542 entries of that list. We need to take "ownership" of the passed-in stream,
1543 and add it to our list, removing the previous entry also (if there was one)
1544 in order to prevent resource leaks. Note the potential for surprise
1545 here: maybe we should keep track of passed-in streams and leave it up to
1546 the user to tidy those up, but that doesn't work for stream handles
1547 returned from acc_get_cuda_stream above... */
1548
1549 oldstream = select_stream_for_async (async, self, false, NULL);
1550
1551 if (oldstream)
1552 {
1553 if (nvthd->ptx_dev->active_streams == oldstream)
1554 nvthd->ptx_dev->active_streams = nvthd->ptx_dev->active_streams->next;
1555 else
1556 {
1557 struct ptx_stream *s = nvthd->ptx_dev->active_streams;
1558 while (s->next != oldstream)
1559 s = s->next;
1560 s->next = s->next->next;
1561 }
1562
1563 cuStreamDestroy (oldstream->stream);
1564 map_fini (oldstream);
1565 free (oldstream);
1566 }
1567
1568 pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
1569
1570 (void) select_stream_for_async (async, self, true, (CUstream) stream);
1571
1572 return 1;
1573 }
1574
1575 /* Plugin entry points. */
1576
1577 const char *
1578 GOMP_OFFLOAD_get_name (void)
1579 {
1580 return "nvptx";
1581 }
1582
1583 unsigned int
1584 GOMP_OFFLOAD_get_caps (void)
1585 {
1586 return GOMP_OFFLOAD_CAP_OPENACC_200;
1587 }
1588
1589 int
1590 GOMP_OFFLOAD_get_type (void)
1591 {
1592 return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
1593 }
1594
1595 int
1596 GOMP_OFFLOAD_get_num_devices (void)
1597 {
1598 return nvptx_get_num_devices ();
1599 }
1600
1601 void
1602 GOMP_OFFLOAD_init_device (int n)
1603 {
1604 pthread_mutex_lock (&ptx_dev_lock);
1605
1606 if (!nvptx_init () || ptx_devices[n] != NULL)
1607 {
1608 pthread_mutex_unlock (&ptx_dev_lock);
1609 return;
1610 }
1611
1612 ptx_devices[n] = nvptx_open_device (n);
1613 instantiated_devices++;
1614
1615 pthread_mutex_unlock (&ptx_dev_lock);
1616 }
1617
1618 void
1619 GOMP_OFFLOAD_fini_device (int n)
1620 {
1621 pthread_mutex_lock (&ptx_dev_lock);
1622
1623 if (ptx_devices[n] != NULL)
1624 {
1625 nvptx_attach_host_thread_to_device (n);
1626 nvptx_close_device (ptx_devices[n]);
1627 ptx_devices[n] = NULL;
1628 instantiated_devices--;
1629 }
1630
1631 pthread_mutex_unlock (&ptx_dev_lock);
1632 }
1633
1634 /* Data emitted by mkoffload. */
1635
1636 typedef struct nvptx_tdata
1637 {
1638 const char *ptx_src;
1639
1640 const char *const *var_names;
1641 size_t var_num;
1642
1643 const char *const *fn_names;
1644 size_t fn_num;
1645 } nvptx_tdata_t;
1646
1647 /* Load the (partial) program described by TARGET_DATA to device
1648 number ORD. Allocate and return TARGET_TABLE. */
1649
1650 int
1651 GOMP_OFFLOAD_load_image (int ord, const void *target_data,
1652 struct addr_pair **target_table)
1653 {
1654 CUmodule module;
1655 const char *const *fn_names, *const *var_names;
1656 unsigned int fn_entries, var_entries, i, j;
1657 CUresult r;
1658 struct targ_fn_descriptor *targ_fns;
1659 struct addr_pair *targ_tbl;
1660 const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
1661 struct ptx_image_data *new_image;
1662 struct ptx_device *dev;
1663
1664 GOMP_OFFLOAD_init_device (ord);
1665
1666 dev = ptx_devices[ord];
1667
1668 nvptx_attach_host_thread_to_device (ord);
1669
1670 link_ptx (&module, img_header->ptx_src);
1671
1672 /* The mkoffload utility emits a struct of pointers/integers at the
1673 start of each offload image.  The arrays of kernel names and of
1674 function addresses are in one-to-one correspondence.  */
1675
1676 var_entries = img_header->var_num;
1677 var_names = img_header->var_names;
1678 fn_entries = img_header->fn_num;
1679 fn_names = img_header->fn_names;
1680
1681 targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
1682 * (fn_entries + var_entries));
1683 targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
1684 * fn_entries);
1685
1686 *target_table = targ_tbl;
1687
1688 new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
1689 new_image->target_data = target_data;
1690 new_image->module = module;
1691 new_image->fns = targ_fns;
1692
1693 pthread_mutex_lock (&dev->image_lock);
1694 new_image->next = dev->images;
1695 dev->images = new_image;
1696 pthread_mutex_unlock (&dev->image_lock);
1697
1698 for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
1699 {
1700 CUfunction function;
1701
1702 r = cuModuleGetFunction (&function, module, fn_names[i]);
1703 if (r != CUDA_SUCCESS)
1704 GOMP_PLUGIN_fatal ("cuModuleGetFunction error: %s", cuda_error (r));
1705
1706 targ_fns->fn = function;
1707 targ_fns->name = (const char *) fn_names[i];
1708
1709 targ_tbl->start = (uintptr_t) targ_fns;
1710 targ_tbl->end = targ_tbl->start + 1;
1711 }
1712
1713 for (j = 0; j < var_entries; j++, targ_tbl++)
1714 {
1715 CUdeviceptr var;
1716 size_t bytes;
1717
1718 r = cuModuleGetGlobal (&var, &bytes, module, var_names[j]);
1719 if (r != CUDA_SUCCESS)
1720 GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
1721
1722 targ_tbl->start = (uintptr_t) var;
1723 targ_tbl->end = targ_tbl->start + bytes;
1724 }
1725
1726 return fn_entries + var_entries;
1727 }
1728
1729 /* Unload the program described by TARGET_DATA from device ORD, freeing the
1730 function descriptors allocated for it by GOMP_OFFLOAD_load_image.  */
1731
1732 void
1733 GOMP_OFFLOAD_unload_image (int ord, const void *target_data)
1734 {
1735 struct ptx_image_data *image, **prev_p;
1736 struct ptx_device *dev = ptx_devices[ord];
1737
1738 pthread_mutex_lock (&dev->image_lock);
1739 for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
1740 if (image->target_data == target_data)
1741 {
1742 *prev_p = image->next;
1743 cuModuleUnload (image->module);
1744 free (image->fns);
1745 free (image);
1746 break;
1747 }
1748 pthread_mutex_unlock (&dev->image_lock);
1749 }
1750
1751 void *
1752 GOMP_OFFLOAD_alloc (int ord, size_t size)
1753 {
1754 nvptx_attach_host_thread_to_device (ord);
1755 return nvptx_alloc (size);
1756 }
1757
1758 void
1759 GOMP_OFFLOAD_free (int ord, void *ptr)
1760 {
1761 nvptx_attach_host_thread_to_device (ord);
1762 nvptx_free (ptr);
1763 }
1764
1765 void *
1766 GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
1767 {
1768 nvptx_attach_host_thread_to_device (ord);
1769 return nvptx_dev2host (dst, src, n);
1770 }
1771
1772 void *
1773 GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
1774 {
1775 nvptx_attach_host_thread_to_device (ord);
1776 return nvptx_host2dev (dst, src, n);
1777 }
1778
1779 void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;
1780
1781 void
1782 GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum,
1783 void **hostaddrs, void **devaddrs, size_t *sizes,
1784 unsigned short *kinds, int num_gangs,
1785 int num_workers, int vector_length, int async,
1786 void *targ_mem_desc)
1787 {
1788 nvptx_exec (fn, mapnum, hostaddrs, devaddrs, sizes, kinds, num_gangs,
1789 num_workers, vector_length, async, targ_mem_desc);
1790 }
1791
1792 void
1793 GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc)
1794 {
1795 CUevent *e;
1796 CUresult r;
1797 struct nvptx_thread *nvthd = nvptx_thread ();
1798
1799 e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
1800
1801 r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
1802 if (r != CUDA_SUCCESS)
1803 GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));
1804
1805 r = cuEventRecord (*e, nvthd->current_stream->stream);
1806 if (r != CUDA_SUCCESS)
1807 GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
1808
1809 event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc);
1810 }
1811
1812 int
1813 GOMP_OFFLOAD_openacc_async_test (int async)
1814 {
1815 return nvptx_async_test (async);
1816 }
1817
1818 int
1819 GOMP_OFFLOAD_openacc_async_test_all (void)
1820 {
1821 return nvptx_async_test_all ();
1822 }
1823
1824 void
1825 GOMP_OFFLOAD_openacc_async_wait (int async)
1826 {
1827 nvptx_wait (async);
1828 }
1829
1830 void
1831 GOMP_OFFLOAD_openacc_async_wait_async (int async1, int async2)
1832 {
1833 nvptx_wait_async (async1, async2);
1834 }
1835
1836 void
1837 GOMP_OFFLOAD_openacc_async_wait_all (void)
1838 {
1839 nvptx_wait_all ();
1840 }
1841
1842 void
1843 GOMP_OFFLOAD_openacc_async_wait_all_async (int async)
1844 {
1845 nvptx_wait_all_async (async);
1846 }
1847
1848 void
1849 GOMP_OFFLOAD_openacc_async_set_async (int async)
1850 {
1851 nvptx_set_async (async);
1852 }
1853
1854 void *
1855 GOMP_OFFLOAD_openacc_create_thread_data (int ord)
1856 {
1857 struct ptx_device *ptx_dev;
1858 struct nvptx_thread *nvthd
1859 = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
1860 CUresult r;
1861 CUcontext thd_ctx;
1862
1863 ptx_dev = ptx_devices[ord];
1864
1865 assert (ptx_dev);
1866
1867 r = cuCtxGetCurrent (&thd_ctx);
1868 if (r != CUDA_SUCCESS)
1869 GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));
1870
1871 assert (ptx_dev->ctx);
1872
1873 if (!thd_ctx)
1874 {
1875 r = cuCtxPushCurrent (ptx_dev->ctx);
1876 if (r != CUDA_SUCCESS)
1877 GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
1878 }
1879
1880 nvthd->current_stream = ptx_dev->null_stream;
1881 nvthd->ptx_dev = ptx_dev;
1882
1883 return (void *) nvthd;
1884 }
1885
1886 void
1887 GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
1888 {
1889 free (data);
1890 }
1891
1892 void *
1893 GOMP_OFFLOAD_openacc_get_current_cuda_device (void)
1894 {
1895 return nvptx_get_current_cuda_device ();
1896 }
1897
1898 void *
1899 GOMP_OFFLOAD_openacc_get_current_cuda_context (void)
1900 {
1901 return nvptx_get_current_cuda_context ();
1902 }
1903
1904 /* NOTE: This returns a CUstream, not a ptx_stream pointer. */
1905
1906 void *
1907 GOMP_OFFLOAD_openacc_get_cuda_stream (int async)
1908 {
1909 return nvptx_get_cuda_stream (async);
1910 }
1911
1912 /* NOTE: This takes a CUstream, not a ptx_stream pointer. */
1913
1914 int
1915 GOMP_OFFLOAD_openacc_set_cuda_stream (int async, void *stream)
1916 {
1917 return nvptx_set_cuda_stream (async, stream);
1918 }