/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * This file implements VkQueue, VkFence, and VkSemaphore
 */

#include "anv_private.h"
#include "util/vk_util.h"

#include "genxml/gen7_pack.h"

VkResult
anv_device_execbuf(struct anv_device *device,
                   struct drm_i915_gem_execbuffer2 *execbuf,
                   struct anv_bo **execbuf_bos)
{
   int ret = anv_gem_execbuffer(device, execbuf);
   if (ret != 0) {
      /* We don't know the real error. */
      device->lost = true;
      return vk_errorf(VK_ERROR_DEVICE_LOST, "execbuf2 failed: %m");
   }

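   /* On success, the kernel writes the final GPU offset of each exec object
    * back into the array we passed in.  Record those offsets on our anv_bo
    * structs so that future submissions can provide valid presumed offsets
    * and keep benefiting from I915_EXEC_NO_RELOC.
    */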
   struct drm_i915_gem_exec_object2 *objects =
      (void *)(uintptr_t)execbuf->buffers_ptr;
   for (uint32_t k = 0; k < execbuf->buffer_count; k++)
      execbuf_bos[k]->offset = objects[k].offset;

   return VK_SUCCESS;
}

VkResult
anv_device_submit_simple_batch(struct anv_device *device,
                               struct anv_batch *batch)
{
   struct drm_i915_gem_execbuffer2 execbuf;
   struct drm_i915_gem_exec_object2 exec2_objects[1];
   struct anv_bo bo, *exec_bos[1];
   VkResult result = VK_SUCCESS;
   uint32_t size;

   /* Kernel driver requires 8 byte aligned batch length */
   size = align_u32(batch->next - batch->start, 8);
   result = anv_bo_pool_alloc(&device->batch_bo_pool, &bo, size);
   if (result != VK_SUCCESS)
      return result;

   memcpy(bo.map, batch->start, size);
   if (!device->info.has_llc)
      anv_flush_range(bo.map, size);

   exec_bos[0] = &bo;
   exec2_objects[0].handle = bo.gem_handle;
   exec2_objects[0].relocation_count = 0;
   exec2_objects[0].relocs_ptr = 0;
   exec2_objects[0].alignment = 0;
   exec2_objects[0].offset = bo.offset;
   exec2_objects[0].flags = 0;
   exec2_objects[0].rsvd1 = 0;
   exec2_objects[0].rsvd2 = 0;

   execbuf.buffers_ptr = (uintptr_t) exec2_objects;
   execbuf.buffer_count = 1;
   execbuf.batch_start_offset = 0;
   execbuf.batch_len = size;
   execbuf.cliprects_ptr = 0;
   execbuf.num_cliprects = 0;
   execbuf.DR1 = 0;
   execbuf.DR4 = 0;

   execbuf.flags =
      I915_EXEC_HANDLE_LUT | I915_EXEC_NO_RELOC | I915_EXEC_RENDER;
   execbuf.rsvd1 = device->context_id;
   execbuf.rsvd2 = 0;

   result = anv_device_execbuf(device, &execbuf, exec_bos);
   if (result != VK_SUCCESS)
      goto fail;

   result = anv_device_wait(device, &bo, INT64_MAX);

fail:
   anv_bo_pool_free(&device->batch_bo_pool, &bo);

   return result;
}
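
/* A minimal sketch (illustrative only, not used by the driver) of how a
 * caller might feed this helper a batch built in temporary storage.  The
 * buffer size is arbitrary and only the struct anv_batch fields this file
 * itself touches are initialized:
 *
 *    uint32_t cmds[64];
 *    struct anv_batch batch = {
 *       .start = cmds,
 *       .next  = cmds,
 *       .end   = cmds + ARRAY_SIZE(cmds),
 *    };
 *    anv_batch_emit(&batch, GEN7_MI_BATCH_BUFFER_END, bbe);
 *
 *    VkResult result = anv_device_submit_simple_batch(device, &batch);
 */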

VkResult anv_QueueSubmit(
    VkQueue                                     _queue,
    uint32_t                                    submitCount,
    const VkSubmitInfo*                         pSubmits,
    VkFence                                     _fence)
{
   ANV_FROM_HANDLE(anv_queue, queue, _queue);
   ANV_FROM_HANDLE(anv_fence, fence, _fence);
   struct anv_device *device = queue->device;

   /* Query for device status prior to submitting.  Technically, we don't
    * need to do this.  However, if we have a client that's submitting piles
    * of garbage, we would rather break as early as possible so that any GPU
    * hangs stay contained.  If we don't check here, we'll either be waiting
    * for the kernel to kick us or we'll have to wait until the client waits
    * on a fence before we actually know whether or not we've hung.
    */
   VkResult result = anv_device_query_status(device);
   if (result != VK_SUCCESS)
      return result;

   /* We lock around QueueSubmit for three main reasons:
    *
    * 1) When a block pool is resized, we create a new gem handle with a
    *    different size and, in the case of surface states, possibly a
    *    different center offset but we re-use the same anv_bo struct when
    *    we do so.  If this happens in the middle of setting up an execbuf,
    *    we could end up with our list of BOs out of sync with our list of
    *    gem handles.
    *
    * 2) The algorithm we use for building the list of unique buffers isn't
    *    thread-safe.  While the client is supposed to synchronize around
    *    QueueSubmit, this would be extremely difficult to debug if it ever
    *    came up in the wild due to a broken app.  It's better to play it
    *    safe and just lock around QueueSubmit.
    *
    * 3) The anv_cmd_buffer_execbuf function may perform relocations in
    *    userspace.  Due to the fact that the surface state buffer is shared
    *    between batches, we can't afford to have that happen from multiple
    *    threads at the same time.  Even though the user is supposed to
    *    ensure this doesn't happen, we play it safe as in (2) above.
    *
    * Since the only other operations that take the device lock, such as
    * block pool resizes, happen only rarely, the lock is almost never
    * contended and taking it is not an expensive operation in this case.
    */
   pthread_mutex_lock(&device->mutex);

   for (uint32_t i = 0; i < submitCount; i++) {
      for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j++) {
         ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer,
                         pSubmits[i].pCommandBuffers[j]);
         assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
         assert(!anv_batch_has_error(&cmd_buffer->batch));

         result = anv_cmd_buffer_execbuf(device, cmd_buffer);
         if (result != VK_SUCCESS)
            goto out;
      }
   }

   if (fence) {
      struct anv_bo *fence_bo = &fence->bo;
      result = anv_device_execbuf(device, &fence->execbuf, &fence_bo);
      if (result != VK_SUCCESS)
         goto out;

      /* Update the fence and wake up any waiters */
      assert(fence->state == ANV_FENCE_STATE_RESET);
      fence->state = ANV_FENCE_STATE_SUBMITTED;
      pthread_cond_broadcast(&device->queue_submit);
   }

out:
   if (result != VK_SUCCESS) {
      /* In the case that something has gone wrong we may end up with an
       * inconsistent state from which it may not be trivial to recover.
       * For example, we might have computed address relocations and
       * any future attempt to re-submit this job will need to know about
       * this and avoid computing relocation addresses again.
       *
       * To avoid this sort of issue, we assume that if something was
       * wrong during submission we must already be in a really bad situation
       * anyway (such as being out of memory) and return
       * VK_ERROR_DEVICE_LOST to ensure that clients do not attempt to
       * submit the same job again to this device.
       */
      result = VK_ERROR_DEVICE_LOST;
      device->lost = true;

      /* If we return VK_ERROR_DEVICE_LOST here, we need to ensure that
       * vkWaitForFences() and vkGetFenceStatus() return a valid result
       * (VK_SUCCESS or VK_ERROR_DEVICE_LOST) in a finite amount of time.
       * Setting the fence status to SIGNALED ensures this will happen in
       * any case.
       */
      if (fence)
         fence->state = ANV_FENCE_STATE_SIGNALED;
   }

   pthread_mutex_unlock(&device->mutex);

   return result;
}
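
/* For reference, a minimal client-side sketch (application code, not part of
 * the driver) of the submit-and-wait pattern this entrypoint implements.
 * The command buffer and fence are assumed to have been created and recorded
 * by the application beforehand:
 *
 *    VkSubmitInfo submit_info = {
 *       .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
 *       .commandBufferCount = 1,
 *       .pCommandBuffers = &cmd_buffer,
 *    };
 *    VkResult result = vkQueueSubmit(queue, 1, &submit_info, fence);
 *    if (result == VK_SUCCESS)
 *       result = vkWaitForFences(device, 1, &fence, VK_TRUE, UINT64_MAX);
 *
 * A VK_ERROR_DEVICE_LOST from either call would typically correspond to one
 * of the device->lost paths above.
 */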

VkResult anv_QueueWaitIdle(
    VkQueue                                     _queue)
{
   ANV_FROM_HANDLE(anv_queue, queue, _queue);

   return anv_DeviceWaitIdle(anv_device_to_handle(queue->device));
}

VkResult anv_CreateFence(
    VkDevice                                    _device,
    const VkFenceCreateInfo*                    pCreateInfo,
    const VkAllocationCallbacks*                pAllocator,
    VkFence*                                    pFence)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   struct anv_bo fence_bo;
   struct anv_fence *fence;
   struct anv_batch batch;
   VkResult result;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_FENCE_CREATE_INFO);

   result = anv_bo_pool_alloc(&device->batch_bo_pool, &fence_bo, 4096);
   if (result != VK_SUCCESS)
      return result;

   /* Fences are small.  Just store the CPU data structure in the BO. */
   fence = fence_bo.map;
   fence->bo = fence_bo;

   /* Place the batch after the CPU data but on its own cache line. */
   const uint32_t batch_offset = align_u32(sizeof(*fence), CACHELINE_SIZE);
   batch.next = batch.start = fence->bo.map + batch_offset;
   batch.end = fence->bo.map + fence->bo.size;
   anv_batch_emit(&batch, GEN7_MI_BATCH_BUFFER_END, bbe);
   anv_batch_emit(&batch, GEN7_MI_NOOP, noop);

   if (!device->info.has_llc) {
      assert(((uintptr_t) batch.start & CACHELINE_MASK) == 0);
      assert(batch.next - batch.start <= CACHELINE_SIZE);
      __builtin_ia32_mfence();
      __builtin_ia32_clflush(batch.start);
   }

   fence->exec2_objects[0].handle = fence->bo.gem_handle;
   fence->exec2_objects[0].relocation_count = 0;
   fence->exec2_objects[0].relocs_ptr = 0;
   fence->exec2_objects[0].alignment = 0;
   fence->exec2_objects[0].offset = fence->bo.offset;
   fence->exec2_objects[0].flags = 0;
   fence->exec2_objects[0].rsvd1 = 0;
   fence->exec2_objects[0].rsvd2 = 0;

   fence->execbuf.buffers_ptr = (uintptr_t) fence->exec2_objects;
   fence->execbuf.buffer_count = 1;
   fence->execbuf.batch_start_offset = batch.start - fence->bo.map;
   fence->execbuf.batch_len = batch.next - batch.start;
   fence->execbuf.cliprects_ptr = 0;
   fence->execbuf.num_cliprects = 0;
   fence->execbuf.DR1 = 0;
   fence->execbuf.DR4 = 0;

   fence->execbuf.flags =
      I915_EXEC_HANDLE_LUT | I915_EXEC_NO_RELOC | I915_EXEC_RENDER;
   fence->execbuf.rsvd1 = device->context_id;
   fence->execbuf.rsvd2 = 0;

   if (pCreateInfo->flags & VK_FENCE_CREATE_SIGNALED_BIT) {
      fence->state = ANV_FENCE_STATE_SIGNALED;
   } else {
      fence->state = ANV_FENCE_STATE_RESET;
   }

   *pFence = anv_fence_to_handle(fence);

   return VK_SUCCESS;
}
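
/* A small client-side sketch (application code) of the two initial states
 * handled by the VK_FENCE_CREATE_SIGNALED_BIT check above:
 *
 *    VkFenceCreateInfo fence_info = {
 *       .sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO,
 *       .flags = 0,                              // starts out RESET
 *    };
 *    VkFence fence;
 *    vkCreateFence(device, &fence_info, NULL, &fence);
 *
 *    fence_info.flags = VK_FENCE_CREATE_SIGNALED_BIT;  // starts out SIGNALED
 *    VkFence signaled_fence;
 *    vkCreateFence(device, &fence_info, NULL, &signaled_fence);
 */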

void anv_DestroyFence(
    VkDevice                                    _device,
    VkFence                                     _fence,
    const VkAllocationCallbacks*                pAllocator)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_fence, fence, _fence);

   if (!fence)
      return;

   assert(fence->bo.map == fence);
   anv_bo_pool_free(&device->batch_bo_pool, &fence->bo);
}

VkResult anv_ResetFences(
    VkDevice                                    _device,
    uint32_t                                    fenceCount,
    const VkFence*                              pFences)
{
   for (uint32_t i = 0; i < fenceCount; i++) {
      ANV_FROM_HANDLE(anv_fence, fence, pFences[i]);
      fence->state = ANV_FENCE_STATE_RESET;
   }

   return VK_SUCCESS;
}

VkResult anv_GetFenceStatus(
    VkDevice                                    _device,
    VkFence                                     _fence)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_fence, fence, _fence);

   if (unlikely(device->lost))
      return VK_ERROR_DEVICE_LOST;

   switch (fence->state) {
   case ANV_FENCE_STATE_RESET:
      /* If it hasn't even been sent off to the GPU yet, it's not ready */
      return VK_NOT_READY;

   case ANV_FENCE_STATE_SIGNALED:
      /* It's been signaled, return success */
      return VK_SUCCESS;

   case ANV_FENCE_STATE_SUBMITTED: {
      VkResult result = anv_device_bo_busy(device, &fence->bo);
      if (result == VK_SUCCESS) {
         fence->state = ANV_FENCE_STATE_SIGNALED;
         return VK_SUCCESS;
      } else {
         return result;
      }
   }
   default:
      unreachable("Invalid fence status");
   }
}
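
/* From the application's point of view, the states above surface through
 * vkGetFenceStatus() as VK_NOT_READY or VK_SUCCESS.  A sketch of a simple
 * polling loop (real code should usually prefer vkWaitForFences() over
 * busy-polling):
 *
 *    VkResult status;
 *    do {
 *       status = vkGetFenceStatus(device, fence);
 *    } while (status == VK_NOT_READY);
 *    // status is now VK_SUCCESS or an error such as VK_ERROR_DEVICE_LOST
 */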

#define NSEC_PER_SEC 1000000000
#define INT_TYPE_MAX(type) ((1ull << (sizeof(type) * 8 - 1)) - 1)
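
/* INT_TYPE_MAX(type) evaluates to the largest value a signed integer with
 * the same size as 'type' can hold.  For example, with a 32-bit time_t,
 * sizeof(type) * 8 - 1 == 31, so the macro yields (1ull << 31) - 1 ==
 * 0x7fffffff (INT32_MAX); with a 64-bit time_t it yields INT64_MAX.  It is
 * used below to clamp tv_sec so the timespec cannot overflow.
 */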

VkResult anv_WaitForFences(
    VkDevice                                    _device,
    uint32_t                                    fenceCount,
    const VkFence*                              pFences,
    VkBool32                                    waitAll,
    uint64_t                                    _timeout)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   int ret;

   if (unlikely(device->lost))
      return VK_ERROR_DEVICE_LOST;

   /* DRM_IOCTL_I915_GEM_WAIT uses a signed 64 bit timeout and is supposed
    * to block indefinitely on timeouts <= 0.  Unfortunately, this was broken
    * for a couple of kernel releases.  Since there's no way to know
    * whether or not the kernel we're using is one of the broken ones, the
    * best we can do is to clamp the timeout to INT64_MAX.  This limits the
    * maximum timeout from 584 years to 292 years, which is likely not a big
    * deal.
    */
   int64_t timeout = MIN2(_timeout, INT64_MAX);

   VkResult result = VK_SUCCESS;
   uint32_t pending_fences = fenceCount;
   while (pending_fences) {
      pending_fences = 0;
      bool signaled_fences = false;
      for (uint32_t i = 0; i < fenceCount; i++) {
         ANV_FROM_HANDLE(anv_fence, fence, pFences[i]);
         switch (fence->state) {
         case ANV_FENCE_STATE_RESET:
            /* This fence hasn't been submitted yet, we'll catch it the next
             * time around.  Yes, this may mean we dead-loop but, short of
             * lots of locking and a condition variable, there's not much that
             * we can do about that.
             */
            pending_fences++;
            continue;

         case ANV_FENCE_STATE_SIGNALED:
            /* This fence is not pending.  If waitAll isn't set, we can return
             * early.  Otherwise, we have to keep going.
             */
            if (!waitAll) {
               result = VK_SUCCESS;
               goto done;
            }
            continue;

         case ANV_FENCE_STATE_SUBMITTED:
            /* These are the fences we really care about.  Go ahead and wait
             * on them until we hit a timeout.
             */
            result = anv_device_wait(device, &fence->bo, timeout);
            switch (result) {
            case VK_SUCCESS:
               fence->state = ANV_FENCE_STATE_SIGNALED;
               signaled_fences = true;
               if (!waitAll)
                  goto done;
               break;

            case VK_TIMEOUT:
               goto done;

            default:
               return result;
            }
         }
      }

      if (pending_fences && !signaled_fences) {
         /* If we've hit this then someone decided to vkWaitForFences before
          * they've actually submitted any of them to a queue.  This is a
          * fairly pessimal case, so it's ok to lock here and use a standard
          * pthreads condition variable.
          */
         pthread_mutex_lock(&device->mutex);

         /* It's possible that some of the fences have changed state since
          * the last time we checked.  Now that we have the lock, check for
          * pending fences again and don't wait if anything has changed.
          */
         uint32_t now_pending_fences = 0;
         for (uint32_t i = 0; i < fenceCount; i++) {
            ANV_FROM_HANDLE(anv_fence, fence, pFences[i]);
            if (fence->state == ANV_FENCE_STATE_RESET)
               now_pending_fences++;
         }
         assert(now_pending_fences <= pending_fences);

         if (now_pending_fences == pending_fences) {
            struct timespec before;
            clock_gettime(CLOCK_MONOTONIC, &before);

            uint32_t abs_nsec = before.tv_nsec + timeout % NSEC_PER_SEC;
            uint64_t abs_sec = before.tv_sec + (abs_nsec / NSEC_PER_SEC) +
                               (timeout / NSEC_PER_SEC);
            abs_nsec %= NSEC_PER_SEC;

            /* Avoid roll-over in tv_sec on 32-bit systems if the
             * user-provided timeout is UINT64_MAX.
             */
            struct timespec abstime;
            abstime.tv_nsec = abs_nsec;
            abstime.tv_sec = MIN2(abs_sec, INT_TYPE_MAX(abstime.tv_sec));

            ret = pthread_cond_timedwait(&device->queue_submit,
                                         &device->mutex, &abstime);
            assert(ret != EINVAL);

            struct timespec after;
            clock_gettime(CLOCK_MONOTONIC, &after);
            uint64_t time_elapsed =
               ((uint64_t)after.tv_sec * NSEC_PER_SEC + after.tv_nsec) -
               ((uint64_t)before.tv_sec * NSEC_PER_SEC + before.tv_nsec);

            if (time_elapsed >= timeout) {
               pthread_mutex_unlock(&device->mutex);
               result = VK_TIMEOUT;
               goto done;
            }

            timeout -= time_elapsed;
         }

         pthread_mutex_unlock(&device->mutex);
      }
   }

done:
   if (unlikely(device->lost))
      return VK_ERROR_DEVICE_LOST;

   return result;
}
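
/* The relative-to-absolute timeout conversion above, pulled out into a
 * stand-alone sketch for clarity.  This helper is hypothetical (it is not
 * part of the driver) and assumes, as the code above does, that the
 * condition variable is waited on against CLOCK_MONOTONIC:
 *
 *    static struct timespec
 *    relative_to_abstime(int64_t timeout_ns)
 *    {
 *       struct timespec now, abstime;
 *       clock_gettime(CLOCK_MONOTONIC, &now);
 *
 *       uint32_t abs_nsec = now.tv_nsec + timeout_ns % NSEC_PER_SEC;
 *       uint64_t abs_sec = now.tv_sec + (abs_nsec / NSEC_PER_SEC) +
 *                          (timeout_ns / NSEC_PER_SEC);
 *
 *       abstime.tv_nsec = abs_nsec % NSEC_PER_SEC;
 *       abstime.tv_sec = MIN2(abs_sec, INT_TYPE_MAX(abstime.tv_sec));
 *       return abstime;
 *    }
 */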

// Queue semaphore functions

VkResult anv_CreateSemaphore(
    VkDevice                                    _device,
    const VkSemaphoreCreateInfo*                pCreateInfo,
    const VkAllocationCallbacks*                pAllocator,
    VkSemaphore*                                pSemaphore)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   struct anv_semaphore *semaphore;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO);

   semaphore = vk_alloc2(&device->alloc, pAllocator, sizeof(*semaphore), 8,
                         VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (semaphore == NULL)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   /* The DRM execbuffer ioctl always executes in order, even between
    * different rings.  As such, a dummy no-op semaphore is a perfectly
    * valid implementation.
    */
   semaphore->permanent.type = ANV_SEMAPHORE_TYPE_DUMMY;
   semaphore->temporary.type = ANV_SEMAPHORE_TYPE_NONE;

   *pSemaphore = anv_semaphore_to_handle(semaphore);

   return VK_SUCCESS;
}
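
/* A client-side sketch (application code) of the usual way a semaphore is
 * used: ordering two submits on the same queue.  Handles are assumed to have
 * been created by the application:
 *
 *    VkPipelineStageFlags wait_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
 *
 *    VkSubmitInfo first = {
 *       .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
 *       .commandBufferCount = 1,
 *       .pCommandBuffers = &cmd_buffer_a,
 *       .signalSemaphoreCount = 1,
 *       .pSignalSemaphores = &semaphore,
 *    };
 *    VkSubmitInfo second = {
 *       .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
 *       .waitSemaphoreCount = 1,
 *       .pWaitSemaphores = &semaphore,
 *       .pWaitDstStageMask = &wait_stage,
 *       .commandBufferCount = 1,
 *       .pCommandBuffers = &cmd_buffer_b,
 *    };
 *    vkQueueSubmit(queue, 1, &first, VK_NULL_HANDLE);
 *    vkQueueSubmit(queue, 1, &second, VK_NULL_HANDLE);
 *
 * Because the kernel executes the two execbufs in order, the semaphore wait
 * is already satisfied by submission order and the dummy implementation
 * above is sufficient.
 */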

void anv_DestroySemaphore(
    VkDevice                                    _device,
    VkSemaphore                                 _semaphore,
    const VkAllocationCallbacks*                pAllocator)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_semaphore, semaphore, _semaphore);

   if (semaphore == NULL)
      return;

   vk_free2(&device->alloc, pAllocator, semaphore);
}