winsys/amdgpu: make amdgpu_bo_unmap non-static
mesa.git: src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
1 /*
2 * Copyright © 2011 Marek Olšák <maraeo@gmail.com>
3 * Copyright © 2015 Advanced Micro Devices, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
16 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
18 * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * The above copyright notice and this permission notice (including the
24 * next paragraph) shall be included in all copies or substantial portions
25 * of the Software.
26 */
27
28 #include "amdgpu_cs.h"
29
30 #include "util/hash_table.h"
31 #include "util/os_time.h"
32 #include "util/u_hash_table.h"
33 #include "frontend/drm_driver.h"
34 #include "drm-uapi/amdgpu_drm.h"
35 #include <xf86drm.h>
36 #include <stdio.h>
37 #include <inttypes.h>
38
39 #ifndef AMDGPU_VA_RANGE_HIGH
40 #define AMDGPU_VA_RANGE_HIGH 0x2
41 #endif
42
43 /* Set to 1 for verbose output showing committed sparse buffer ranges. */
44 #define DEBUG_SPARSE_COMMITS 0
45
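/* A half-open range [begin, end) of currently free pages inside a sparse
 * backing buffer, in units of RADEON_SPARSE_PAGE_SIZE pages; see
 * sparse_backing_alloc() and sparse_backing_free() below.
 */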
46 struct amdgpu_sparse_backing_chunk {
47 uint32_t begin, end;
48 };
49
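/* Usage sketch (the buffer below is hypothetical): a timeout of 0 makes this
 * a non-blocking busy check, while PIPE_TIMEOUT_INFINITE blocks until the
 * buffer is idle:
 *
 *    if (!amdgpu_bo_wait(buf, 0, RADEON_USAGE_READWRITE))
 *       amdgpu_bo_wait(buf, PIPE_TIMEOUT_INFINITE, RADEON_USAGE_READWRITE);
 */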
50 static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout,
51 enum radeon_bo_usage usage)
52 {
53 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
54 struct amdgpu_winsys *ws = bo->ws;
55 int64_t abs_timeout;
56
57 if (timeout == 0) {
58 if (p_atomic_read(&bo->num_active_ioctls))
59 return false;
60
61 } else {
62 abs_timeout = os_time_get_absolute_timeout(timeout);
63
64 /* Wait if any ioctl is being submitted with this buffer. */
65 if (!os_wait_until_zero_abs_timeout(&bo->num_active_ioctls, abs_timeout))
66 return false;
67 }
68
69 if (bo->is_shared) {
70 /* We can't use user fences for shared buffers, because user fences
71 * are local to this process only. If we want to wait for all buffer
72 * uses in all processes, we have to use amdgpu_bo_wait_for_idle.
73 */
74 bool buffer_busy = true;
75 int r;
76
77 r = amdgpu_bo_wait_for_idle(bo->bo, timeout, &buffer_busy);
78 if (r)
79 fprintf(stderr, "%s: amdgpu_bo_wait_for_idle failed %i\n", __func__,
80 r);
81 return !buffer_busy;
82 }
83
84 if (timeout == 0) {
85 unsigned idle_fences;
86 bool buffer_idle;
87
88 simple_mtx_lock(&ws->bo_fence_lock);
89
90 for (idle_fences = 0; idle_fences < bo->num_fences; ++idle_fences) {
91 if (!amdgpu_fence_wait(bo->fences[idle_fences], 0, false))
92 break;
93 }
94
95 /* Release the idle fences to avoid checking them again later. */
96 for (unsigned i = 0; i < idle_fences; ++i)
97 amdgpu_fence_reference(&bo->fences[i], NULL);
98
99 memmove(&bo->fences[0], &bo->fences[idle_fences],
100 (bo->num_fences - idle_fences) * sizeof(*bo->fences));
101 bo->num_fences -= idle_fences;
102
103 buffer_idle = !bo->num_fences;
104 simple_mtx_unlock(&ws->bo_fence_lock);
105
106 return buffer_idle;
107 } else {
108 bool buffer_idle = true;
109
110 simple_mtx_lock(&ws->bo_fence_lock);
111 while (bo->num_fences && buffer_idle) {
112 struct pipe_fence_handle *fence = NULL;
113 bool fence_idle = false;
114
115 amdgpu_fence_reference(&fence, bo->fences[0]);
116
117 /* Wait for the fence. */
118 simple_mtx_unlock(&ws->bo_fence_lock);
119 if (amdgpu_fence_wait(fence, abs_timeout, true))
120 fence_idle = true;
121 else
122 buffer_idle = false;
123 simple_mtx_lock(&ws->bo_fence_lock);
124
125 /* Release an idle fence to avoid checking it again later, keeping in
126 * mind that the fence array may have been modified by other threads.
127 */
128 if (fence_idle && bo->num_fences && bo->fences[0] == fence) {
129 amdgpu_fence_reference(&bo->fences[0], NULL);
130 memmove(&bo->fences[0], &bo->fences[1],
131 (bo->num_fences - 1) * sizeof(*bo->fences));
132 bo->num_fences--;
133 }
134
135 amdgpu_fence_reference(&fence, NULL);
136 }
137 simple_mtx_unlock(&ws->bo_fence_lock);
138
139 return buffer_idle;
140 }
141 }
142
143 static enum radeon_bo_domain amdgpu_bo_get_initial_domain(
144 struct pb_buffer *buf)
145 {
146 return ((struct amdgpu_winsys_bo*)buf)->initial_domain;
147 }
148
149 static enum radeon_bo_flag amdgpu_bo_get_flags(
150 struct pb_buffer *buf)
151 {
152 return ((struct amdgpu_winsys_bo*)buf)->flags;
153 }
154
155 static void amdgpu_bo_remove_fences(struct amdgpu_winsys_bo *bo)
156 {
157 for (unsigned i = 0; i < bo->num_fences; ++i)
158 amdgpu_fence_reference(&bo->fences[i], NULL);
159
160 FREE(bo->fences);
161 bo->num_fences = 0;
162 bo->max_fences = 0;
163 }
164
165 void amdgpu_bo_destroy(struct pb_buffer *_buf)
166 {
167 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
168 struct amdgpu_screen_winsys *sws_iter;
169 struct amdgpu_winsys *ws = bo->ws;
170
171 assert(bo->bo && "must not be called for slab entries");
172
173 if (!bo->is_user_ptr && bo->cpu_ptr) {
174 bo->cpu_ptr = NULL;
175 amdgpu_bo_unmap(&bo->base);
176 }
177 assert(bo->is_user_ptr || bo->u.real.map_count == 0);
178
179 if (ws->debug_all_bos) {
180 simple_mtx_lock(&ws->global_bo_list_lock);
181 list_del(&bo->u.real.global_list_item);
182 ws->num_buffers--;
183 simple_mtx_unlock(&ws->global_bo_list_lock);
184 }
185
186 /* Close all KMS handles retrieved for other DRM file descriptions */
187 simple_mtx_lock(&ws->sws_list_lock);
188 for (sws_iter = ws->sws_list; sws_iter; sws_iter = sws_iter->next) {
189 struct hash_entry *entry;
190
191 if (!sws_iter->kms_handles)
192 continue;
193
194 entry = _mesa_hash_table_search(sws_iter->kms_handles, bo);
195 if (entry) {
196 struct drm_gem_close args = { .handle = (uintptr_t)entry->data };
197
198 drmIoctl(sws_iter->fd, DRM_IOCTL_GEM_CLOSE, &args);
199 _mesa_hash_table_remove(sws_iter->kms_handles, entry);
200 }
201 }
202 simple_mtx_unlock(&ws->sws_list_lock);
203
204 simple_mtx_lock(&ws->bo_export_table_lock);
205 _mesa_hash_table_remove_key(ws->bo_export_table, bo->bo);
206 simple_mtx_unlock(&ws->bo_export_table_lock);
207
208 if (bo->initial_domain & RADEON_DOMAIN_VRAM_GTT) {
209 amdgpu_bo_va_op(bo->bo, 0, bo->base.size, bo->va, 0, AMDGPU_VA_OP_UNMAP);
210 amdgpu_va_range_free(bo->u.real.va_handle);
211 }
212 amdgpu_bo_free(bo->bo);
213
214 amdgpu_bo_remove_fences(bo);
215
216 if (bo->initial_domain & RADEON_DOMAIN_VRAM)
217 ws->allocated_vram -= align64(bo->base.size, ws->info.gart_page_size);
218 else if (bo->initial_domain & RADEON_DOMAIN_GTT)
219 ws->allocated_gtt -= align64(bo->base.size, ws->info.gart_page_size);
220
221 simple_mtx_destroy(&bo->lock);
222 FREE(bo);
223 }
224
225 static void amdgpu_bo_destroy_or_cache(struct pb_buffer *_buf)
226 {
227 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
228
229 assert(bo->bo); /* slab buffers have a separate vtbl */
230
231 if (bo->u.real.use_reusable_pool)
232 pb_cache_add_buffer(&bo->u.real.cache_entry);
233 else
234 amdgpu_bo_destroy(_buf);
235 }
236
237 static void amdgpu_clean_up_buffer_managers(struct amdgpu_winsys *ws)
238 {
239 for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) {
240 pb_slabs_reclaim(&ws->bo_slabs[i]);
241 if (ws->secure)
242 pb_slabs_reclaim(&ws->bo_slabs_encrypted[i]);
243 }
244
245 pb_cache_release_all_buffers(&ws->bo_cache);
246 }
247
248 static bool amdgpu_bo_do_map(struct amdgpu_winsys_bo *bo, void **cpu)
249 {
250 assert(!bo->sparse && bo->bo && !bo->is_user_ptr);
251 int r = amdgpu_bo_cpu_map(bo->bo, cpu);
252 if (r) {
253 /* Clean up buffer managers and try again. */
254 amdgpu_clean_up_buffer_managers(bo->ws);
255 r = amdgpu_bo_cpu_map(bo->bo, cpu);
256 if (r)
257 return false;
258 }
259
260 if (p_atomic_inc_return(&bo->u.real.map_count) == 1) {
261 if (bo->initial_domain & RADEON_DOMAIN_VRAM)
262 bo->ws->mapped_vram += bo->base.size;
263 else if (bo->initial_domain & RADEON_DOMAIN_GTT)
264 bo->ws->mapped_gtt += bo->base.size;
265 bo->ws->num_mapped_buffers++;
266 }
267
268 return true;
269 }
270
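/* Usage sketch (buf, data and size are hypothetical): a plain mapping is
 * cached in the buffer and stays mapped until the buffer is destroyed, while
 * RADEON_TRANSFER_TEMPORARY mappings must be balanced by an unmap:
 *
 *    void *ptr = amdgpu_bo_map(buf, NULL, PIPE_TRANSFER_WRITE |
 *                                         RADEON_TRANSFER_TEMPORARY);
 *    if (ptr) {
 *       memcpy(ptr, data, size);
 *       amdgpu_bo_unmap(buf);
 *    }
 */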
271 void *amdgpu_bo_map(struct pb_buffer *buf,
272 struct radeon_cmdbuf *rcs,
273 enum pipe_transfer_usage usage)
274 {
275 struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
276 struct amdgpu_winsys_bo *real;
277 struct amdgpu_cs *cs = (struct amdgpu_cs*)rcs;
278
279 assert(!bo->sparse);
280
281 /* If it's not unsynchronized bo_map, flush CS if needed and then wait. */
282 if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) {
283 /* DONTBLOCK doesn't make sense with UNSYNCHRONIZED. */
284 if (usage & PIPE_TRANSFER_DONTBLOCK) {
285 if (!(usage & PIPE_TRANSFER_WRITE)) {
286 /* Mapping for read.
287 *
288 * Since we are mapping for read, we don't need to wait
289 * if the GPU is using the buffer for read too
290 * (neither one is changing it).
291 *
292 * Only check whether the buffer is being used for write. */
293 if (cs && amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo,
294 RADEON_USAGE_WRITE)) {
295 cs->flush_cs(cs->flush_data,
296 RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
297 return NULL;
298 }
299
300 if (!amdgpu_bo_wait((struct pb_buffer*)bo, 0,
301 RADEON_USAGE_WRITE)) {
302 return NULL;
303 }
304 } else {
305 if (cs && amdgpu_bo_is_referenced_by_cs(cs, bo)) {
306 cs->flush_cs(cs->flush_data,
307 RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
308 return NULL;
309 }
310
311 if (!amdgpu_bo_wait((struct pb_buffer*)bo, 0,
312 RADEON_USAGE_READWRITE)) {
313 return NULL;
314 }
315 }
316 } else {
317 uint64_t time = os_time_get_nano();
318
319 if (!(usage & PIPE_TRANSFER_WRITE)) {
320 /* Mapping for read.
321 *
322 * Since we are mapping for read, we don't need to wait
323 * if the GPU is using the buffer for read too
324 * (neither one is changing it).
325 *
326 * Only check whether the buffer is being used for write. */
327 if (cs) {
328 if (amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo,
329 RADEON_USAGE_WRITE)) {
330 cs->flush_cs(cs->flush_data,
331 RADEON_FLUSH_START_NEXT_GFX_IB_NOW, NULL);
332 } else {
333 /* Try to avoid busy-waiting in amdgpu_bo_wait. */
334 if (p_atomic_read(&bo->num_active_ioctls))
335 amdgpu_cs_sync_flush(rcs);
336 }
337 }
338
339 amdgpu_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
340 RADEON_USAGE_WRITE);
341 } else {
342 /* Mapping for write. */
343 if (cs) {
344 if (amdgpu_bo_is_referenced_by_cs(cs, bo)) {
345 cs->flush_cs(cs->flush_data,
346 RADEON_FLUSH_START_NEXT_GFX_IB_NOW, NULL);
347 } else {
348 /* Try to avoid busy-waiting in amdgpu_bo_wait. */
349 if (p_atomic_read(&bo->num_active_ioctls))
350 amdgpu_cs_sync_flush(rcs);
351 }
352 }
353
354 amdgpu_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
355 RADEON_USAGE_READWRITE);
356 }
357
358 bo->ws->buffer_wait_time += os_time_get_nano() - time;
359 }
360 }
361
362 /* Buffer synchronization has been checked, now actually map the buffer. */
363 void *cpu = NULL;
364 uint64_t offset = 0;
365
366 if (bo->bo) {
367 real = bo;
368 } else {
369 real = bo->u.slab.real;
370 offset = bo->va - real->va;
371 }
372
373 if (usage & RADEON_TRANSFER_TEMPORARY) {
374 if (real->is_user_ptr) {
375 cpu = real->cpu_ptr;
376 } else {
377 if (!amdgpu_bo_do_map(real, &cpu))
378 return NULL;
379 }
380 } else {
381 cpu = p_atomic_read(&real->cpu_ptr);
382 if (!cpu) {
383 simple_mtx_lock(&real->lock);
384 /* Must re-check due to the possibility of a race. Re-check need not
385 * be atomic thanks to the lock. */
386 cpu = real->cpu_ptr;
387 if (!cpu) {
388 if (!amdgpu_bo_do_map(real, &cpu)) {
389 simple_mtx_unlock(&real->lock);
390 return NULL;
391 }
392 p_atomic_set(&real->cpu_ptr, cpu);
393 }
394 simple_mtx_unlock(&real->lock);
395 }
396 }
397
398 return (uint8_t*)cpu + offset;
399 }
400
401 void amdgpu_bo_unmap(struct pb_buffer *buf)
402 {
403 struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
404 struct amdgpu_winsys_bo *real;
405
406 assert(!bo->sparse);
407
408 if (bo->is_user_ptr)
409 return;
410
411 real = bo->bo ? bo : bo->u.slab.real;
412 assert(real->u.real.map_count != 0 && "too many unmaps");
413 if (p_atomic_dec_zero(&real->u.real.map_count)) {
414 assert(!real->cpu_ptr &&
415 "too many unmaps or forgot RADEON_TRANSFER_TEMPORARY flag");
416
417 if (real->initial_domain & RADEON_DOMAIN_VRAM)
418 real->ws->mapped_vram -= real->base.size;
419 else if (real->initial_domain & RADEON_DOMAIN_GTT)
420 real->ws->mapped_gtt -= real->base.size;
421 real->ws->num_mapped_buffers--;
422 }
423
424 amdgpu_bo_cpu_unmap(real->bo);
425 }
426
427 static const struct pb_vtbl amdgpu_winsys_bo_vtbl = {
428 amdgpu_bo_destroy_or_cache
429 /* other functions are never called */
430 };
431
432 static void amdgpu_add_buffer_to_global_list(struct amdgpu_winsys_bo *bo)
433 {
434 struct amdgpu_winsys *ws = bo->ws;
435
436 assert(bo->bo);
437
438 if (ws->debug_all_bos) {
439 simple_mtx_lock(&ws->global_bo_list_lock);
440 list_addtail(&bo->u.real.global_list_item, &ws->global_bo_list);
441 ws->num_buffers++;
442 simple_mtx_unlock(&ws->global_bo_list_lock);
443 }
444 }
445
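/* Worked example (the numbers are illustrative): for a 192 KiB allocation
 * with 4 KiB alignment on GFX9, assuming a 2 MiB PTE fragment size, the size
 * is below the fragment size, so only the MSB rule applies:
 * util_last_bit64(0x30000) = 18, hence msb_alignment = 1 << 17 = 128 KiB,
 * which becomes the returned VM alignment.
 */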
446 static uint64_t amdgpu_get_optimal_vm_alignment(struct amdgpu_winsys *ws,
447 uint64_t size, unsigned alignment)
448 {
449 uint64_t vm_alignment = alignment;
450
451 /* Increase the VM alignment for faster address translation. */
452 if (size >= ws->info.pte_fragment_size)
453 vm_alignment = MAX2(vm_alignment, ws->info.pte_fragment_size);
454
455 /* Gfx9: Increase the VM alignment to the most significant bit set
456 * in the size for faster address translation.
457 */
458 if (ws->info.chip_class >= GFX9) {
459 unsigned msb = util_last_bit64(size); /* 0 = no bit is set */
460 uint64_t msb_alignment = msb ? 1ull << (msb - 1) : 0;
461
462 vm_alignment = MAX2(vm_alignment, msb_alignment);
463 }
464 return vm_alignment;
465 }
466
467 static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws,
468 uint64_t size,
469 unsigned alignment,
470 enum radeon_bo_domain initial_domain,
471 unsigned flags,
472 int heap)
473 {
474 struct amdgpu_bo_alloc_request request = {0};
475 amdgpu_bo_handle buf_handle;
476 uint64_t va = 0;
477 struct amdgpu_winsys_bo *bo;
478 amdgpu_va_handle va_handle = NULL;
479 int r;
480
481 /* Exactly one of VRAM, GTT, GDS or OA must be specified. */
482 assert(util_bitcount(initial_domain & (RADEON_DOMAIN_VRAM_GTT |
483 RADEON_DOMAIN_GDS |
484 RADEON_DOMAIN_OA)) == 1);
485
486 bo = CALLOC_STRUCT(amdgpu_winsys_bo);
487 if (!bo) {
488 return NULL;
489 }
490
491 if (heap >= 0) {
492 pb_cache_init_entry(&ws->bo_cache, &bo->u.real.cache_entry, &bo->base,
493 heap);
494 }
495 request.alloc_size = size;
496 request.phys_alignment = alignment;
497
498 if (initial_domain & RADEON_DOMAIN_VRAM) {
499 request.preferred_heap |= AMDGPU_GEM_DOMAIN_VRAM;
500
501 /* Since VRAM and GTT have almost the same performance on APUs, we could
502 * just set GTT. However, in order to decrease GTT(RAM) usage, which is
503 * shared with the OS, allow VRAM placements too. The point is not that
504 * VRAM is more useful here, but that it would otherwise sit unused and wasted.
505 */
506 if (!ws->info.has_dedicated_vram)
507 request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT;
508 }
509
510 if (initial_domain & RADEON_DOMAIN_GTT)
511 request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT;
512 if (initial_domain & RADEON_DOMAIN_GDS)
513 request.preferred_heap |= AMDGPU_GEM_DOMAIN_GDS;
514 if (initial_domain & RADEON_DOMAIN_OA)
515 request.preferred_heap |= AMDGPU_GEM_DOMAIN_OA;
516
517 if (flags & RADEON_FLAG_NO_CPU_ACCESS)
518 request.flags |= AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
519 if (flags & RADEON_FLAG_GTT_WC)
520 request.flags |= AMDGPU_GEM_CREATE_CPU_GTT_USWC;
521 if (ws->zero_all_vram_allocs &&
522 (request.preferred_heap & AMDGPU_GEM_DOMAIN_VRAM))
523 request.flags |= AMDGPU_GEM_CREATE_VRAM_CLEARED;
524 if ((flags & RADEON_FLAG_ENCRYPTED) && ws->secure)
525 request.flags |= AMDGPU_GEM_CREATE_ENCRYPTED;
526
527 r = amdgpu_bo_alloc(ws->dev, &request, &buf_handle);
528 if (r) {
529 fprintf(stderr, "amdgpu: Failed to allocate a buffer:\n");
530 fprintf(stderr, "amdgpu: size : %"PRIu64" bytes\n", size);
531 fprintf(stderr, "amdgpu: alignment : %u bytes\n", alignment);
532 fprintf(stderr, "amdgpu: domains : %u\n", initial_domain);
533 fprintf(stderr, "amdgpu: flags : %" PRIx64 "\n", request.flags);
534 goto error_bo_alloc;
535 }
536
537 if (initial_domain & RADEON_DOMAIN_VRAM_GTT) {
538 unsigned va_gap_size = ws->check_vm ? MAX2(4 * alignment, 64 * 1024) : 0;
539
540 r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
541 size + va_gap_size,
542 amdgpu_get_optimal_vm_alignment(ws, size, alignment),
543 0, &va, &va_handle,
544 (flags & RADEON_FLAG_32BIT ? AMDGPU_VA_RANGE_32_BIT : 0) |
545 AMDGPU_VA_RANGE_HIGH);
546 if (r)
547 goto error_va_alloc;
548
549 unsigned vm_flags = AMDGPU_VM_PAGE_READABLE |
550 AMDGPU_VM_PAGE_EXECUTABLE;
551
552 if (!(flags & RADEON_FLAG_READ_ONLY))
553 vm_flags |= AMDGPU_VM_PAGE_WRITEABLE;
554
555 if (flags & RADEON_FLAG_UNCACHED)
556 vm_flags |= AMDGPU_VM_MTYPE_UC;
557
558 r = amdgpu_bo_va_op_raw(ws->dev, buf_handle, 0, size, va, vm_flags,
559 AMDGPU_VA_OP_MAP);
560 if (r)
561 goto error_va_map;
562 }
563
564 simple_mtx_init(&bo->lock, mtx_plain);
565 pipe_reference_init(&bo->base.reference, 1);
566 bo->base.alignment = alignment;
567 bo->base.usage = 0;
568 bo->base.size = size;
569 bo->base.vtbl = &amdgpu_winsys_bo_vtbl;
570 bo->ws = ws;
571 bo->bo = buf_handle;
572 bo->va = va;
573 bo->u.real.va_handle = va_handle;
574 bo->initial_domain = initial_domain;
575 bo->flags = flags;
576 bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1);
577
578 if (initial_domain & RADEON_DOMAIN_VRAM)
579 ws->allocated_vram += align64(size, ws->info.gart_page_size);
580 else if (initial_domain & RADEON_DOMAIN_GTT)
581 ws->allocated_gtt += align64(size, ws->info.gart_page_size);
582
583 amdgpu_bo_export(bo->bo, amdgpu_bo_handle_type_kms, &bo->u.real.kms_handle);
584
585 amdgpu_add_buffer_to_global_list(bo);
586
587 return bo;
588
589 error_va_map:
590 amdgpu_va_range_free(va_handle);
591
592 error_va_alloc:
593 amdgpu_bo_free(buf_handle);
594
595 error_bo_alloc:
596 FREE(bo);
597 return NULL;
598 }
599
600 bool amdgpu_bo_can_reclaim(struct pb_buffer *_buf)
601 {
602 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
603
604 if (amdgpu_bo_is_referenced_by_any_cs(bo)) {
605 return false;
606 }
607
608 return amdgpu_bo_wait(_buf, 0, RADEON_USAGE_READWRITE);
609 }
610
611 bool amdgpu_bo_can_reclaim_slab(void *priv, struct pb_slab_entry *entry)
612 {
613 struct amdgpu_winsys_bo *bo = NULL; /* NULL is fine: container_of only needs the pointer's type */
614 bo = container_of(entry, bo, u.slab.entry);
615
616 return amdgpu_bo_can_reclaim(&bo->base);
617 }
618
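/* Worked example (the orders are illustrative): a slab allocator with
 * min_order = 8 and num_orders = 5 serves entries up to
 * 1 << (8 + 5 - 1) = 4096 bytes, so a 3000-byte request is handled by it,
 * while a 5000-byte request falls through to the next, larger allocator.
 */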
619 static struct pb_slabs *get_slabs(struct amdgpu_winsys *ws, uint64_t size,
620 enum radeon_bo_flag flags)
621 {
622 struct pb_slabs *bo_slabs = ((flags & RADEON_FLAG_ENCRYPTED) && ws->secure) ?
623 ws->bo_slabs_encrypted : ws->bo_slabs;
624 /* Find the correct slab allocator for the given size. */
625 for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) {
626 struct pb_slabs *slabs = &bo_slabs[i];
627
628 if (size <= 1 << (slabs->min_order + slabs->num_orders - 1))
629 return slabs;
630 }
631
632 assert(0);
633 return NULL;
634 }
635
636 static void amdgpu_bo_slab_destroy(struct pb_buffer *_buf)
637 {
638 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
639
640 assert(!bo->bo);
641
642 if (bo->flags & RADEON_FLAG_ENCRYPTED)
643 pb_slab_free(get_slabs(bo->ws,
644 bo->base.size,
645 RADEON_FLAG_ENCRYPTED), &bo->u.slab.entry);
646 else
647 pb_slab_free(get_slabs(bo->ws,
648 bo->base.size,
649 0), &bo->u.slab.entry);
650 }
651
652 static const struct pb_vtbl amdgpu_winsys_bo_slab_vtbl = {
653 amdgpu_bo_slab_destroy
654 /* other functions are never called */
655 };
656
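/* Illustrative sizing: the slab buffer is twice the largest entry its
 * allocator can hand out (e.g. 8 KiB slabs for 4 KiB entries), except that
 * the last, largest allocator rounds its slabs up to the PTE fragment size
 * for faster address translation.
 */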
657 static struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap,
658 unsigned entry_size,
659 unsigned group_index,
660 bool encrypted)
661 {
662 struct amdgpu_winsys *ws = priv;
663 struct amdgpu_slab *slab = CALLOC_STRUCT(amdgpu_slab);
664 enum radeon_bo_domain domains = radeon_domain_from_heap(heap);
665 enum radeon_bo_flag flags = radeon_flags_from_heap(heap);
666 uint32_t base_id;
667 unsigned slab_size = 0;
668
669 if (!slab)
670 return NULL;
671
672 if (encrypted)
673 flags |= RADEON_FLAG_ENCRYPTED;
674
675 struct pb_slabs *slabs = (flags & RADEON_FLAG_ENCRYPTED && ws->secure) ?
676 ws->bo_slabs_encrypted : ws->bo_slabs;
677
678 /* Determine the slab buffer size. */
679 for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) {
680 unsigned max_entry_size = 1 << (slabs[i].min_order + slabs[i].num_orders - 1);
681
682 if (entry_size <= max_entry_size) {
683 /* The slab size is twice the size of the largest possible entry. */
684 slab_size = max_entry_size * 2;
685
686 /* The largest slab should have the same size as the PTE fragment
687 * size to get faster address translation.
688 */
689 if (i == NUM_SLAB_ALLOCATORS - 1 &&
690 slab_size < ws->info.pte_fragment_size)
691 slab_size = ws->info.pte_fragment_size;
692 break;
693 }
694 }
695 assert(slab_size != 0);
696
697 slab->buffer = amdgpu_winsys_bo(amdgpu_bo_create(ws,
698 slab_size, slab_size,
699 domains, flags));
700 if (!slab->buffer)
701 goto fail;
702
703 slab->base.num_entries = slab->buffer->base.size / entry_size;
704 slab->base.num_free = slab->base.num_entries;
705 slab->entries = CALLOC(slab->base.num_entries, sizeof(*slab->entries));
706 if (!slab->entries)
707 goto fail_buffer;
708
709 list_inithead(&slab->base.free);
710
711 base_id = __sync_fetch_and_add(&ws->next_bo_unique_id, slab->base.num_entries);
712
713 for (unsigned i = 0; i < slab->base.num_entries; ++i) {
714 struct amdgpu_winsys_bo *bo = &slab->entries[i];
715
716 simple_mtx_init(&bo->lock, mtx_plain);
717 bo->base.alignment = entry_size;
718 bo->base.usage = slab->buffer->base.usage;
719 bo->base.size = entry_size;
720 bo->base.vtbl = &amdgpu_winsys_bo_slab_vtbl;
721 bo->ws = ws;
722 bo->va = slab->buffer->va + i * entry_size;
723 bo->initial_domain = domains;
724 bo->unique_id = base_id + i;
725 bo->u.slab.entry.slab = &slab->base;
726 bo->u.slab.entry.group_index = group_index;
727
728 if (slab->buffer->bo) {
729 /* The slab is not suballocated. */
730 bo->u.slab.real = slab->buffer;
731 } else {
732 /* The slab is allocated out of a bigger slab. */
733 bo->u.slab.real = slab->buffer->u.slab.real;
734 assert(bo->u.slab.real->bo);
735 }
736
737 list_addtail(&bo->u.slab.entry.head, &slab->base.free);
738 }
739
740 return &slab->base;
741
742 fail_buffer:
743 amdgpu_winsys_bo_reference(&slab->buffer, NULL);
744 fail:
745 FREE(slab);
746 return NULL;
747 }
748
749 struct pb_slab *amdgpu_bo_slab_alloc_encrypted(void *priv, unsigned heap,
750 unsigned entry_size,
751 unsigned group_index)
752 {
753 return amdgpu_bo_slab_alloc(priv, heap, entry_size, group_index, true);
754 }
755
756 struct pb_slab *amdgpu_bo_slab_alloc_normal(void *priv, unsigned heap,
757 unsigned entry_size,
758 unsigned group_index)
759 {
760 return amdgpu_bo_slab_alloc(priv, heap, entry_size, group_index, false);
761 }
762
763 void amdgpu_bo_slab_free(void *priv, struct pb_slab *pslab)
764 {
765 struct amdgpu_slab *slab = amdgpu_slab(pslab);
766
767 for (unsigned i = 0; i < slab->base.num_entries; ++i) {
768 amdgpu_bo_remove_fences(&slab->entries[i]);
769 simple_mtx_destroy(&slab->entries[i].lock);
770 }
771
772 FREE(slab->entries);
773 amdgpu_winsys_bo_reference(&slab->buffer, NULL);
774 FREE(slab);
775 }
776
777 #if DEBUG_SPARSE_COMMITS
778 static void
779 sparse_dump(struct amdgpu_winsys_bo *bo, const char *func)
780 {
781 fprintf(stderr, "%s: %p (size=%"PRIu64", num_va_pages=%u) @ %s\n"
782 "Commitments:\n",
783 __func__, bo, bo->base.size, bo->u.sparse.num_va_pages, func);
784
785 struct amdgpu_sparse_backing *span_backing = NULL;
786 uint32_t span_first_backing_page = 0;
787 uint32_t span_first_va_page = 0;
788 uint32_t va_page = 0;
789
790 for (;;) {
791 struct amdgpu_sparse_backing *backing = 0;
792 uint32_t backing_page = 0;
793
794 if (va_page < bo->u.sparse.num_va_pages) {
795 backing = bo->u.sparse.commitments[va_page].backing;
796 backing_page = bo->u.sparse.commitments[va_page].page;
797 }
798
799 if (span_backing &&
800 (backing != span_backing ||
801 backing_page != span_first_backing_page + (va_page - span_first_va_page))) {
802 fprintf(stderr, " %u..%u: backing=%p:%u..%u\n",
803 span_first_va_page, va_page - 1, span_backing,
804 span_first_backing_page,
805 span_first_backing_page + (va_page - span_first_va_page) - 1);
806
807 span_backing = NULL;
808 }
809
810 if (va_page >= bo->u.sparse.num_va_pages)
811 break;
812
813 if (backing && !span_backing) {
814 span_backing = backing;
815 span_first_backing_page = backing_page;
816 span_first_va_page = va_page;
817 }
818
819 va_page++;
820 }
821
822 fprintf(stderr, "Backing:\n");
823
824 list_for_each_entry(struct amdgpu_sparse_backing, backing, &bo->u.sparse.backing, list) {
825 fprintf(stderr, " %p (size=%"PRIu64")\n", backing, backing->bo->base.size);
826 for (unsigned i = 0; i < backing->num_chunks; ++i)
827 fprintf(stderr, " %u..%u\n", backing->chunks[i].begin, backing->chunks[i].end);
828 }
829 }
830 #endif
831
832 /*
833 * Attempt to allocate the given number of backing pages. Fewer pages may be
834 * allocated (depending on the fragmentation of existing backing buffers),
835 * which will be reflected by a change to *pnum_pages.
836 */
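/* Worked example: if the caller asks for *pnum_pages = 300 but the best free
 * chunk only spans pages [0, 256), this returns that backing buffer with
 * *pstart_page = 0 and *pnum_pages clamped to 256; amdgpu_bo_sparse_commit
 * below simply keeps calling until the whole span is covered.
 */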
837 static struct amdgpu_sparse_backing *
838 sparse_backing_alloc(struct amdgpu_winsys_bo *bo, uint32_t *pstart_page, uint32_t *pnum_pages)
839 {
840 struct amdgpu_sparse_backing *best_backing;
841 unsigned best_idx;
842 uint32_t best_num_pages;
843
844 best_backing = NULL;
845 best_idx = 0;
846 best_num_pages = 0;
847
848 /* This is a very simple and inefficient best-fit algorithm. */
849 list_for_each_entry(struct amdgpu_sparse_backing, backing, &bo->u.sparse.backing, list) {
850 for (unsigned idx = 0; idx < backing->num_chunks; ++idx) {
851 uint32_t cur_num_pages = backing->chunks[idx].end - backing->chunks[idx].begin;
852 if ((best_num_pages < *pnum_pages && cur_num_pages > best_num_pages) ||
853 (best_num_pages > *pnum_pages && cur_num_pages < best_num_pages)) {
854 best_backing = backing;
855 best_idx = idx;
856 best_num_pages = cur_num_pages;
857 }
858 }
859 }
860
861 /* Allocate a new backing buffer if necessary. */
862 if (!best_backing) {
863 struct pb_buffer *buf;
864 uint64_t size;
865 uint32_t pages;
866
867 best_backing = CALLOC_STRUCT(amdgpu_sparse_backing);
868 if (!best_backing)
869 return NULL;
870
871 best_backing->max_chunks = 4;
872 best_backing->chunks = CALLOC(best_backing->max_chunks,
873 sizeof(*best_backing->chunks));
874 if (!best_backing->chunks) {
875 FREE(best_backing);
876 return NULL;
877 }
878
879 assert(bo->u.sparse.num_backing_pages < DIV_ROUND_UP(bo->base.size, RADEON_SPARSE_PAGE_SIZE));
880
881 size = MIN3(bo->base.size / 16,
882 8 * 1024 * 1024,
883 bo->base.size - (uint64_t)bo->u.sparse.num_backing_pages * RADEON_SPARSE_PAGE_SIZE);
884 size = MAX2(size, RADEON_SPARSE_PAGE_SIZE);
885
886 buf = amdgpu_bo_create(bo->ws, size, RADEON_SPARSE_PAGE_SIZE,
887 bo->initial_domain,
888 bo->u.sparse.flags | RADEON_FLAG_NO_SUBALLOC);
889 if (!buf) {
890 FREE(best_backing->chunks);
891 FREE(best_backing);
892 return NULL;
893 }
894
895 /* We might have gotten a bigger buffer than requested via caching. */
896 pages = buf->size / RADEON_SPARSE_PAGE_SIZE;
897
898 best_backing->bo = amdgpu_winsys_bo(buf);
899 best_backing->num_chunks = 1;
900 best_backing->chunks[0].begin = 0;
901 best_backing->chunks[0].end = pages;
902
903 list_add(&best_backing->list, &bo->u.sparse.backing);
904 bo->u.sparse.num_backing_pages += pages;
905
906 best_idx = 0;
907 best_num_pages = pages;
908 }
909
910 *pnum_pages = MIN2(*pnum_pages, best_num_pages);
911 *pstart_page = best_backing->chunks[best_idx].begin;
912 best_backing->chunks[best_idx].begin += *pnum_pages;
913
914 if (best_backing->chunks[best_idx].begin >= best_backing->chunks[best_idx].end) {
915 memmove(&best_backing->chunks[best_idx], &best_backing->chunks[best_idx + 1],
916 sizeof(*best_backing->chunks) * (best_backing->num_chunks - best_idx - 1));
917 best_backing->num_chunks--;
918 }
919
920 return best_backing;
921 }
922
923 static void
924 sparse_free_backing_buffer(struct amdgpu_winsys_bo *bo,
925 struct amdgpu_sparse_backing *backing)
926 {
927 struct amdgpu_winsys *ws = backing->bo->ws;
928
929 bo->u.sparse.num_backing_pages -= backing->bo->base.size / RADEON_SPARSE_PAGE_SIZE;
930
931 simple_mtx_lock(&ws->bo_fence_lock);
932 amdgpu_add_fences(backing->bo, bo->num_fences, bo->fences);
933 simple_mtx_unlock(&ws->bo_fence_lock);
934
935 list_del(&backing->list);
936 amdgpu_winsys_bo_reference(&backing->bo, NULL);
937 FREE(backing->chunks);
938 FREE(backing);
939 }
940
941 /*
942 * Return a range of pages from the given backing buffer back into the
943 * free structure.
944 */
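/* Worked example: with free chunks [0, 4) and [8, 12), freeing pages [4, 8)
 * first extends the left chunk to [0, 8) and then merges it with the right
 * one into [0, 12); once all pages of the backing buffer are free again,
 * sparse_free_backing_buffer() releases the buffer itself.
 */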
945 static bool
946 sparse_backing_free(struct amdgpu_winsys_bo *bo,
947 struct amdgpu_sparse_backing *backing,
948 uint32_t start_page, uint32_t num_pages)
949 {
950 uint32_t end_page = start_page + num_pages;
951 unsigned low = 0;
952 unsigned high = backing->num_chunks;
953
954 /* Find the first chunk with begin >= start_page. */
955 while (low < high) {
956 unsigned mid = low + (high - low) / 2;
957
958 if (backing->chunks[mid].begin >= start_page)
959 high = mid;
960 else
961 low = mid + 1;
962 }
963
964 assert(low >= backing->num_chunks || end_page <= backing->chunks[low].begin);
965 assert(low == 0 || backing->chunks[low - 1].end <= start_page);
966
967 if (low > 0 && backing->chunks[low - 1].end == start_page) {
968 backing->chunks[low - 1].end = end_page;
969
970 if (low < backing->num_chunks && end_page == backing->chunks[low].begin) {
971 backing->chunks[low - 1].end = backing->chunks[low].end;
972 memmove(&backing->chunks[low], &backing->chunks[low + 1],
973 sizeof(*backing->chunks) * (backing->num_chunks - low - 1));
974 backing->num_chunks--;
975 }
976 } else if (low < backing->num_chunks && end_page == backing->chunks[low].begin) {
977 backing->chunks[low].begin = start_page;
978 } else {
979 if (backing->num_chunks >= backing->max_chunks) {
980 unsigned new_max_chunks = 2 * backing->max_chunks;
981 struct amdgpu_sparse_backing_chunk *new_chunks =
982 REALLOC(backing->chunks,
983 sizeof(*backing->chunks) * backing->max_chunks,
984 sizeof(*backing->chunks) * new_max_chunks);
985 if (!new_chunks)
986 return false;
987
988 backing->max_chunks = new_max_chunks;
989 backing->chunks = new_chunks;
990 }
991
992 memmove(&backing->chunks[low + 1], &backing->chunks[low],
993 sizeof(*backing->chunks) * (backing->num_chunks - low));
994 backing->chunks[low].begin = start_page;
995 backing->chunks[low].end = end_page;
996 backing->num_chunks++;
997 }
998
999 if (backing->num_chunks == 1 && backing->chunks[0].begin == 0 &&
1000 backing->chunks[0].end == backing->bo->base.size / RADEON_SPARSE_PAGE_SIZE)
1001 sparse_free_backing_buffer(bo, backing);
1002
1003 return true;
1004 }
1005
1006 static void amdgpu_bo_sparse_destroy(struct pb_buffer *_buf)
1007 {
1008 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
1009 int r;
1010
1011 assert(!bo->bo && bo->sparse);
1012
1013 r = amdgpu_bo_va_op_raw(bo->ws->dev, NULL, 0,
1014 (uint64_t)bo->u.sparse.num_va_pages * RADEON_SPARSE_PAGE_SIZE,
1015 bo->va, 0, AMDGPU_VA_OP_CLEAR);
1016 if (r) {
1017 fprintf(stderr, "amdgpu: clearing PRT VA region on destroy failed (%d)\n", r);
1018 }
1019
1020 while (!list_is_empty(&bo->u.sparse.backing)) {
1021 struct amdgpu_sparse_backing *dummy = NULL;
1022 sparse_free_backing_buffer(bo,
1023 container_of(bo->u.sparse.backing.next,
1024 dummy, list));
1025 }
1026
1027 amdgpu_va_range_free(bo->u.sparse.va_handle);
1028 FREE(bo->u.sparse.commitments);
1029 simple_mtx_destroy(&bo->lock);
1030 FREE(bo);
1031 }
1032
1033 static const struct pb_vtbl amdgpu_winsys_bo_sparse_vtbl = {
1034 amdgpu_bo_sparse_destroy
1035 /* other functions are never called */
1036 };
1037
1038 static struct pb_buffer *
1039 amdgpu_bo_sparse_create(struct amdgpu_winsys *ws, uint64_t size,
1040 enum radeon_bo_domain domain,
1041 enum radeon_bo_flag flags)
1042 {
1043 struct amdgpu_winsys_bo *bo;
1044 uint64_t map_size;
1045 uint64_t va_gap_size;
1046 int r;
1047
1048 /* We use 32-bit page numbers; refuse to attempt allocating sparse buffers
1049 * that exceed this limit. This is not really a restriction: we don't have
1050 * that much virtual address space anyway.
1051 */
1052 if (size > (uint64_t)INT32_MAX * RADEON_SPARSE_PAGE_SIZE)
1053 return NULL;
1054
1055 bo = CALLOC_STRUCT(amdgpu_winsys_bo);
1056 if (!bo)
1057 return NULL;
1058
1059 simple_mtx_init(&bo->lock, mtx_plain);
1060 pipe_reference_init(&bo->base.reference, 1);
1061 bo->base.alignment = RADEON_SPARSE_PAGE_SIZE;
1062 bo->base.size = size;
1063 bo->base.vtbl = &amdgpu_winsys_bo_sparse_vtbl;
1064 bo->ws = ws;
1065 bo->initial_domain = domain;
1066 bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1);
1067 bo->sparse = true;
1068 bo->u.sparse.flags = flags & ~RADEON_FLAG_SPARSE;
1069
1070 bo->u.sparse.num_va_pages = DIV_ROUND_UP(size, RADEON_SPARSE_PAGE_SIZE);
1071 bo->u.sparse.commitments = CALLOC(bo->u.sparse.num_va_pages,
1072 sizeof(*bo->u.sparse.commitments));
1073 if (!bo->u.sparse.commitments)
1074 goto error_alloc_commitments;
1075
1076 list_inithead(&bo->u.sparse.backing);
1077
1078 /* For simplicity, we always map a multiple of the page size. */
1079 map_size = align64(size, RADEON_SPARSE_PAGE_SIZE);
1080 va_gap_size = ws->check_vm ? 4 * RADEON_SPARSE_PAGE_SIZE : 0;
1081 r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
1082 map_size + va_gap_size, RADEON_SPARSE_PAGE_SIZE,
1083 0, &bo->va, &bo->u.sparse.va_handle,
1084 AMDGPU_VA_RANGE_HIGH);
1085 if (r)
1086 goto error_va_alloc;
1087
1088 r = amdgpu_bo_va_op_raw(bo->ws->dev, NULL, 0, size, bo->va,
1089 AMDGPU_VM_PAGE_PRT, AMDGPU_VA_OP_MAP);
1090 if (r)
1091 goto error_va_map;
1092
1093 return &bo->base;
1094
1095 error_va_map:
1096 amdgpu_va_range_free(bo->u.sparse.va_handle);
1097 error_va_alloc:
1098 FREE(bo->u.sparse.commitments);
1099 error_alloc_commitments:
1100 simple_mtx_destroy(&bo->lock);
1101 FREE(bo);
1102 return NULL;
1103 }
1104
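/* Usage sketch (rws, buf and the sizes are hypothetical): drivers reach this
 * function through the buffer_commit winsys hook to commit or decommit
 * page-aligned ranges of a sparse buffer:
 *
 *    struct pb_buffer *buf =
 *       rws->buffer_create(rws, 1024 * 1024 * 1024, RADEON_SPARSE_PAGE_SIZE,
 *                          RADEON_DOMAIN_VRAM,
 *                          RADEON_FLAG_SPARSE | RADEON_FLAG_NO_CPU_ACCESS |
 *                          RADEON_FLAG_GTT_WC);
 *    rws->buffer_commit(buf, 0, 16 * RADEON_SPARSE_PAGE_SIZE, true);
 *    ...
 *    rws->buffer_commit(buf, 0, 16 * RADEON_SPARSE_PAGE_SIZE, false);
 */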
1105 static bool
1106 amdgpu_bo_sparse_commit(struct pb_buffer *buf, uint64_t offset, uint64_t size,
1107 bool commit)
1108 {
1109 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(buf);
1110 struct amdgpu_sparse_commitment *comm;
1111 uint32_t va_page, end_va_page;
1112 bool ok = true;
1113 int r;
1114
1115 assert(bo->sparse);
1116 assert(offset % RADEON_SPARSE_PAGE_SIZE == 0);
1117 assert(offset <= bo->base.size);
1118 assert(size <= bo->base.size - offset);
1119 assert(size % RADEON_SPARSE_PAGE_SIZE == 0 || offset + size == bo->base.size);
1120
1121 comm = bo->u.sparse.commitments;
1122 va_page = offset / RADEON_SPARSE_PAGE_SIZE;
1123 end_va_page = va_page + DIV_ROUND_UP(size, RADEON_SPARSE_PAGE_SIZE);
1124
1125 simple_mtx_lock(&bo->lock);
1126
1127 #if DEBUG_SPARSE_COMMITS
1128 sparse_dump(bo, __func__);
1129 #endif
1130
1131 if (commit) {
1132 while (va_page < end_va_page) {
1133 uint32_t span_va_page;
1134
1135 /* Skip pages that are already committed. */
1136 if (comm[va_page].backing) {
1137 va_page++;
1138 continue;
1139 }
1140
1141 /* Determine length of uncommitted span. */
1142 span_va_page = va_page;
1143 while (va_page < end_va_page && !comm[va_page].backing)
1144 va_page++;
1145
1146 /* Fill the uncommitted span with chunks of backing memory. */
1147 while (span_va_page < va_page) {
1148 struct amdgpu_sparse_backing *backing;
1149 uint32_t backing_start, backing_size;
1150
1151 backing_size = va_page - span_va_page;
1152 backing = sparse_backing_alloc(bo, &backing_start, &backing_size);
1153 if (!backing) {
1154 ok = false;
1155 goto out;
1156 }
1157
1158 r = amdgpu_bo_va_op_raw(bo->ws->dev, backing->bo->bo,
1159 (uint64_t)backing_start * RADEON_SPARSE_PAGE_SIZE,
1160 (uint64_t)backing_size * RADEON_SPARSE_PAGE_SIZE,
1161 bo->va + (uint64_t)span_va_page * RADEON_SPARSE_PAGE_SIZE,
1162 AMDGPU_VM_PAGE_READABLE |
1163 AMDGPU_VM_PAGE_WRITEABLE |
1164 AMDGPU_VM_PAGE_EXECUTABLE,
1165 AMDGPU_VA_OP_REPLACE);
1166 if (r) {
1167 ok = sparse_backing_free(bo, backing, backing_start, backing_size);
1168 assert(ok && "sufficient memory should already be allocated");
1169
1170 ok = false;
1171 goto out;
1172 }
1173
1174 while (backing_size) {
1175 comm[span_va_page].backing = backing;
1176 comm[span_va_page].page = backing_start;
1177 span_va_page++;
1178 backing_start++;
1179 backing_size--;
1180 }
1181 }
1182 }
1183 } else {
1184 r = amdgpu_bo_va_op_raw(bo->ws->dev, NULL, 0,
1185 (uint64_t)(end_va_page - va_page) * RADEON_SPARSE_PAGE_SIZE,
1186 bo->va + (uint64_t)va_page * RADEON_SPARSE_PAGE_SIZE,
1187 AMDGPU_VM_PAGE_PRT, AMDGPU_VA_OP_REPLACE);
1188 if (r) {
1189 ok = false;
1190 goto out;
1191 }
1192
1193 while (va_page < end_va_page) {
1194 struct amdgpu_sparse_backing *backing;
1195 uint32_t backing_start;
1196 uint32_t span_pages;
1197
1198 /* Skip pages that are already uncommitted. */
1199 if (!comm[va_page].backing) {
1200 va_page++;
1201 continue;
1202 }
1203
1204 /* Group contiguous spans of pages. */
1205 backing = comm[va_page].backing;
1206 backing_start = comm[va_page].page;
1207 comm[va_page].backing = NULL;
1208
1209 span_pages = 1;
1210 va_page++;
1211
1212 while (va_page < end_va_page &&
1213 comm[va_page].backing == backing &&
1214 comm[va_page].page == backing_start + span_pages) {
1215 comm[va_page].backing = NULL;
1216 va_page++;
1217 span_pages++;
1218 }
1219
1220 if (!sparse_backing_free(bo, backing, backing_start, span_pages)) {
1221 /* Couldn't allocate tracking data structures, so we have to leak */
1222 fprintf(stderr, "amdgpu: leaking PRT backing memory\n");
1223 ok = false;
1224 }
1225 }
1226 }
1227 out:
1228
1229 simple_mtx_unlock(&bo->lock);
1230
1231 return ok;
1232 }
1233
1234 static void amdgpu_buffer_get_metadata(struct pb_buffer *_buf,
1235 struct radeon_bo_metadata *md,
1236 struct radeon_surf *surf)
1237 {
1238 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
1239 struct amdgpu_bo_info info = {0};
1240 int r;
1241
1242 assert(bo->bo && "must not be called for slab entries");
1243
1244 r = amdgpu_bo_query_info(bo->bo, &info);
1245 if (r)
1246 return;
1247
1248 ac_surface_set_bo_metadata(&bo->ws->info, surf, info.metadata.tiling_info,
1249 &md->mode);
1250
1251 md->size_metadata = info.metadata.size_metadata;
1252 memcpy(md->metadata, info.metadata.umd_metadata, sizeof(md->metadata));
1253 }
1254
1255 static void amdgpu_buffer_set_metadata(struct pb_buffer *_buf,
1256 struct radeon_bo_metadata *md,
1257 struct radeon_surf *surf)
1258 {
1259 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
1260 struct amdgpu_bo_metadata metadata = {0};
1261
1262 assert(bo->bo && "must not be called for slab entries");
1263
1264 ac_surface_get_bo_metadata(&bo->ws->info, surf, &metadata.tiling_info);
1265
1266 metadata.size_metadata = md->size_metadata;
1267 memcpy(metadata.umd_metadata, md->metadata, sizeof(md->metadata));
1268
1269 amdgpu_bo_set_metadata(bo->bo, &metadata);
1270 }
1271
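/* Allocation strategy, in order: suballocate small buffers from a slab when
 * allowed; create a sparse buffer when RADEON_FLAG_SPARSE is set; otherwise
 * try the reusable cache (only for RADEON_FLAG_NO_INTERPROCESS_SHARING
 * buffers) and finally allocate a fresh BO, cleaning up the buffer managers
 * and retrying once on failure.
 */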
1272 struct pb_buffer *
1273 amdgpu_bo_create(struct amdgpu_winsys *ws,
1274 uint64_t size,
1275 unsigned alignment,
1276 enum radeon_bo_domain domain,
1277 enum radeon_bo_flag flags)
1278 {
1279 struct amdgpu_winsys_bo *bo;
1280 int heap = -1;
1281
1282 if (domain & (RADEON_DOMAIN_GDS | RADEON_DOMAIN_OA))
1283 flags |= RADEON_FLAG_NO_CPU_ACCESS | RADEON_FLAG_NO_SUBALLOC;
1284
1285 /* VRAM implies WC. This is not optional. */
1286 assert(!(domain & RADEON_DOMAIN_VRAM) || flags & RADEON_FLAG_GTT_WC);
1287
1288 /* NO_CPU_ACCESS is not valid with GTT. */
1289 assert(!(domain & RADEON_DOMAIN_GTT) || !(flags & RADEON_FLAG_NO_CPU_ACCESS));
1290
1291 /* Sparse buffers must have NO_CPU_ACCESS set. */
1292 assert(!(flags & RADEON_FLAG_SPARSE) || flags & RADEON_FLAG_NO_CPU_ACCESS);
1293
1294 struct pb_slabs *slabs = (flags & RADEON_FLAG_ENCRYPTED && ws->secure) ?
1295 ws->bo_slabs_encrypted : ws->bo_slabs;
1296 struct pb_slabs *last_slab = &slabs[NUM_SLAB_ALLOCATORS - 1];
1297 unsigned max_slab_entry_size = 1 << (last_slab->min_order + last_slab->num_orders - 1);
1298
1299 /* Sub-allocate small buffers from slabs. */
1300 if (!(flags & (RADEON_FLAG_NO_SUBALLOC | RADEON_FLAG_SPARSE)) &&
1301 size <= max_slab_entry_size &&
1302 /* The alignment must be at most the size of the smallest slab entry or
1303 * the next power of two of the buffer size. */
1304 alignment <= MAX2(1 << slabs[0].min_order, util_next_power_of_two(size))) {
1305 struct pb_slab_entry *entry;
1306 int heap = radeon_get_heap_index(domain, flags);
1307
1308 if (heap < 0 || heap >= RADEON_MAX_SLAB_HEAPS)
1309 goto no_slab;
1310
1311 struct pb_slabs *slabs = get_slabs(ws, size, flags);
1312 entry = pb_slab_alloc(slabs, size, heap);
1313 if (!entry) {
1314 /* Clean up buffer managers and try again. */
1315 amdgpu_clean_up_buffer_managers(ws);
1316
1317 entry = pb_slab_alloc(slabs, size, heap);
1318 }
1319 if (!entry)
1320 return NULL;
1321
1322 bo = NULL;
1323 bo = container_of(entry, bo, u.slab.entry);
1324
1325 pipe_reference_init(&bo->base.reference, 1);
1326
1327 return &bo->base;
1328 }
1329 no_slab:
1330
1331 if (flags & RADEON_FLAG_SPARSE) {
1332 assert(RADEON_SPARSE_PAGE_SIZE % alignment == 0);
1333
1334 return amdgpu_bo_sparse_create(ws, size, domain, flags);
1335 }
1336
1337 /* This flag is irrelevant for the cache. */
1338 flags &= ~RADEON_FLAG_NO_SUBALLOC;
1339
1340 /* Align the size to the page size. This is the minimum alignment for
1341 * normal BOs. Doing it here helps the cached buffer manager: small BOs in
1342 * particular, such as constant/uniform buffers, benefit from more reuse.
1343 */
1344 if (domain & RADEON_DOMAIN_VRAM_GTT) {
1345 size = align64(size, ws->info.gart_page_size);
1346 alignment = align(alignment, ws->info.gart_page_size);
1347 }
1348
1349 bool use_reusable_pool = flags & RADEON_FLAG_NO_INTERPROCESS_SHARING;
1350
1351 if (use_reusable_pool) {
1352 heap = radeon_get_heap_index(domain, flags & ~RADEON_FLAG_ENCRYPTED);
1353 assert(heap >= 0 && heap < RADEON_MAX_CACHED_HEAPS);
1354
1355 /* Get a buffer from the cache. */
1356 bo = (struct amdgpu_winsys_bo*)
1357 pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment, 0, heap);
1358 if (bo)
1359 return &bo->base;
1360 }
1361
1362 /* Create a new one. */
1363 bo = amdgpu_create_bo(ws, size, alignment, domain, flags, heap);
1364 if (!bo) {
1365 /* Clean up buffer managers and try again. */
1366 amdgpu_clean_up_buffer_managers(ws);
1367
1368 bo = amdgpu_create_bo(ws, size, alignment, domain, flags, heap);
1369 if (!bo)
1370 return NULL;
1371 }
1372
1373 bo->u.real.use_reusable_pool = use_reusable_pool;
1374 return &bo->base;
1375 }
1376
1377 static struct pb_buffer *
1378 amdgpu_buffer_create(struct radeon_winsys *ws,
1379 uint64_t size,
1380 unsigned alignment,
1381 enum radeon_bo_domain domain,
1382 enum radeon_bo_flag flags)
1383 {
1384 struct pb_buffer * res = amdgpu_bo_create(amdgpu_winsys(ws), size, alignment, domain,
1385 flags);
1386 return res;
1387 }
1388
1389 static struct pb_buffer *amdgpu_bo_from_handle(struct radeon_winsys *rws,
1390 struct winsys_handle *whandle,
1391 unsigned vm_alignment)
1392 {
1393 struct amdgpu_winsys *ws = amdgpu_winsys(rws);
1394 struct amdgpu_winsys_bo *bo = NULL;
1395 enum amdgpu_bo_handle_type type;
1396 struct amdgpu_bo_import_result result = {0};
1397 uint64_t va;
1398 amdgpu_va_handle va_handle = NULL;
1399 struct amdgpu_bo_info info = {0};
1400 enum radeon_bo_domain initial = 0;
1401 enum radeon_bo_flag flags = 0;
1402 int r;
1403
1404 switch (whandle->type) {
1405 case WINSYS_HANDLE_TYPE_SHARED:
1406 type = amdgpu_bo_handle_type_gem_flink_name;
1407 break;
1408 case WINSYS_HANDLE_TYPE_FD:
1409 type = amdgpu_bo_handle_type_dma_buf_fd;
1410 break;
1411 default:
1412 return NULL;
1413 }
1414
1415 r = amdgpu_bo_import(ws->dev, type, whandle->handle, &result);
1416 if (r)
1417 return NULL;
1418
1419 simple_mtx_lock(&ws->bo_export_table_lock);
1420 bo = util_hash_table_get(ws->bo_export_table, result.buf_handle);
1421
1422 /* If the amdgpu_winsys_bo instance already exists, bump the reference
1423 * counter and return it.
1424 */
1425 if (bo) {
1426 p_atomic_inc(&bo->base.reference.count);
1427 simple_mtx_unlock(&ws->bo_export_table_lock);
1428
1429 /* Release the buffer handle, because we don't need it anymore.
1430 * This function is returning an existing buffer, which has its own
1431 * handle.
1432 */
1433 amdgpu_bo_free(result.buf_handle);
1434 return &bo->base;
1435 }
1436
1437 /* Get initial domains. */
1438 r = amdgpu_bo_query_info(result.buf_handle, &info);
1439 if (r)
1440 goto error;
1441
1442 r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
1443 result.alloc_size,
1444 amdgpu_get_optimal_vm_alignment(ws, result.alloc_size,
1445 vm_alignment),
1446 0, &va, &va_handle, AMDGPU_VA_RANGE_HIGH);
1447 if (r)
1448 goto error;
1449
1450 bo = CALLOC_STRUCT(amdgpu_winsys_bo);
1451 if (!bo)
1452 goto error;
1453
1454 r = amdgpu_bo_va_op(result.buf_handle, 0, result.alloc_size, va, 0, AMDGPU_VA_OP_MAP);
1455 if (r)
1456 goto error;
1457
1458 if (info.preferred_heap & AMDGPU_GEM_DOMAIN_VRAM)
1459 initial |= RADEON_DOMAIN_VRAM;
1460 if (info.preferred_heap & AMDGPU_GEM_DOMAIN_GTT)
1461 initial |= RADEON_DOMAIN_GTT;
1462 if (info.alloc_flags & AMDGPU_GEM_CREATE_NO_CPU_ACCESS)
1463 flags |= RADEON_FLAG_NO_CPU_ACCESS;
1464 if (info.alloc_flags & AMDGPU_GEM_CREATE_CPU_GTT_USWC)
1465 flags |= RADEON_FLAG_GTT_WC;
1466 if (info.alloc_flags & AMDGPU_GEM_CREATE_ENCRYPTED)
1467 flags |= RADEON_FLAG_ENCRYPTED;
1468
1469 /* Initialize the structure. */
1470 simple_mtx_init(&bo->lock, mtx_plain);
1471 pipe_reference_init(&bo->base.reference, 1);
1472 bo->base.alignment = info.phys_alignment;
1473 bo->bo = result.buf_handle;
1474 bo->base.size = result.alloc_size;
1475 bo->base.vtbl = &amdgpu_winsys_bo_vtbl;
1476 bo->ws = ws;
1477 bo->va = va;
1478 bo->u.real.va_handle = va_handle;
1479 bo->initial_domain = initial;
1480 bo->flags = flags;
1481 bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1);
1482 bo->is_shared = true;
1483
1484 if (bo->initial_domain & RADEON_DOMAIN_VRAM)
1485 ws->allocated_vram += align64(bo->base.size, ws->info.gart_page_size);
1486 else if (bo->initial_domain & RADEON_DOMAIN_GTT)
1487 ws->allocated_gtt += align64(bo->base.size, ws->info.gart_page_size);
1488
1489 amdgpu_bo_export(bo->bo, amdgpu_bo_handle_type_kms, &bo->u.real.kms_handle);
1490
1491 amdgpu_add_buffer_to_global_list(bo);
1492
1493 _mesa_hash_table_insert(ws->bo_export_table, bo->bo, bo);
1494 simple_mtx_unlock(&ws->bo_export_table_lock);
1495
1496 return &bo->base;
1497
1498 error:
1499 simple_mtx_unlock(&ws->bo_export_table_lock);
1500 if (bo)
1501 FREE(bo);
1502 if (va_handle)
1503 amdgpu_va_range_free(va_handle);
1504 amdgpu_bo_free(result.buf_handle);
1505 return NULL;
1506 }
1507
1508 static bool amdgpu_bo_get_handle(struct radeon_winsys *rws,
1509 struct pb_buffer *buffer,
1510 struct winsys_handle *whandle)
1511 {
1512 struct amdgpu_screen_winsys *sws = amdgpu_screen_winsys(rws);
1513 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(buffer);
1514 struct amdgpu_winsys *ws = bo->ws;
1515 enum amdgpu_bo_handle_type type;
1516 struct hash_entry *entry;
1517 int r;
1518
1519 /* Don't allow exports of slab entries and sparse buffers. */
1520 if (!bo->bo)
1521 return false;
1522
1523 bo->u.real.use_reusable_pool = false;
1524
1525 switch (whandle->type) {
1526 case WINSYS_HANDLE_TYPE_SHARED:
1527 type = amdgpu_bo_handle_type_gem_flink_name;
1528 break;
1529 case WINSYS_HANDLE_TYPE_KMS:
1530 if (sws->fd == ws->fd) {
1531 whandle->handle = bo->u.real.kms_handle;
1532
1533 if (bo->is_shared)
1534 return true;
1535
1536 goto hash_table_set;
1537 }
1538
1539 simple_mtx_lock(&ws->sws_list_lock);
1540 entry = _mesa_hash_table_search(sws->kms_handles, bo);
1541 simple_mtx_unlock(&ws->sws_list_lock);
1542 if (entry) {
1543 whandle->handle = (uintptr_t)entry->data;
1544 return true;
1545 }
1546 /* Fall through */
1547 case WINSYS_HANDLE_TYPE_FD:
1548 type = amdgpu_bo_handle_type_dma_buf_fd;
1549 break;
1550 default:
1551 return false;
1552 }
1553
1554 r = amdgpu_bo_export(bo->bo, type, &whandle->handle);
1555 if (r)
1556 return false;
1557
1558 if (whandle->type == WINSYS_HANDLE_TYPE_KMS) {
1559 int dma_fd = whandle->handle;
1560
1561 r = drmPrimeFDToHandle(sws->fd, dma_fd, &whandle->handle);
1562 close(dma_fd);
1563
1564 if (r)
1565 return false;
1566
1567 simple_mtx_lock(&ws->sws_list_lock);
1568 _mesa_hash_table_insert_pre_hashed(sws->kms_handles,
1569 bo->u.real.kms_handle, bo,
1570 (void*)(uintptr_t)whandle->handle);
1571 simple_mtx_unlock(&ws->sws_list_lock);
1572 }
1573
1574 hash_table_set:
1575 simple_mtx_lock(&ws->bo_export_table_lock);
1576 _mesa_hash_table_insert(ws->bo_export_table, bo->bo, bo);
1577 simple_mtx_unlock(&ws->bo_export_table_lock);
1578
1579 bo->is_shared = true;
1580 return true;
1581 }
1582
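/* Usage sketch (ptr and size are hypothetical): the user pointer must remain
 * valid for the lifetime of the returned BO, and the mapping is rounded up
 * to a whole GART page internally:
 *
 *    struct pb_buffer *buf = rws->buffer_from_ptr(rws, ptr, size);
 */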
1583 static struct pb_buffer *amdgpu_bo_from_ptr(struct radeon_winsys *rws,
1584 void *pointer, uint64_t size)
1585 {
1586 struct amdgpu_winsys *ws = amdgpu_winsys(rws);
1587 amdgpu_bo_handle buf_handle;
1588 struct amdgpu_winsys_bo *bo;
1589 uint64_t va;
1590 amdgpu_va_handle va_handle;
1591 /* Avoid failure when the size is not page aligned */
1592 uint64_t aligned_size = align64(size, ws->info.gart_page_size);
1593
1594 bo = CALLOC_STRUCT(amdgpu_winsys_bo);
1595 if (!bo)
1596 return NULL;
1597
1598 if (amdgpu_create_bo_from_user_mem(ws->dev, pointer,
1599 aligned_size, &buf_handle))
1600 goto error;
1601
1602 if (amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
1603 aligned_size,
1604 amdgpu_get_optimal_vm_alignment(ws, aligned_size,
1605 ws->info.gart_page_size),
1606 0, &va, &va_handle, AMDGPU_VA_RANGE_HIGH))
1607 goto error_va_alloc;
1608
1609 if (amdgpu_bo_va_op(buf_handle, 0, aligned_size, va, 0, AMDGPU_VA_OP_MAP))
1610 goto error_va_map;
1611
1612 /* Initialize it. */
1613 bo->is_user_ptr = true;
1614 pipe_reference_init(&bo->base.reference, 1);
1615 simple_mtx_init(&bo->lock, mtx_plain);
1616 bo->bo = buf_handle;
1617 bo->base.alignment = 0;
1618 bo->base.size = size;
1619 bo->base.vtbl = &amdgpu_winsys_bo_vtbl;
1620 bo->ws = ws;
1621 bo->cpu_ptr = pointer;
1622 bo->va = va;
1623 bo->u.real.va_handle = va_handle;
1624 bo->initial_domain = RADEON_DOMAIN_GTT;
1625 bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1);
1626
1627 ws->allocated_gtt += aligned_size;
1628
1629 amdgpu_add_buffer_to_global_list(bo);
1630
1631 amdgpu_bo_export(bo->bo, amdgpu_bo_handle_type_kms, &bo->u.real.kms_handle);
1632
1633 return (struct pb_buffer*)bo;
1634
1635 error_va_map:
1636 amdgpu_va_range_free(va_handle);
1637
1638 error_va_alloc:
1639 amdgpu_bo_free(buf_handle);
1640
1641 error:
1642 FREE(bo);
1643 return NULL;
1644 }
1645
1646 static bool amdgpu_bo_is_user_ptr(struct pb_buffer *buf)
1647 {
1648 return ((struct amdgpu_winsys_bo*)buf)->is_user_ptr;
1649 }
1650
1651 static bool amdgpu_bo_is_suballocated(struct pb_buffer *buf)
1652 {
1653 struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
1654
1655 return !bo->bo && !bo->sparse;
1656 }
1657
1658 static uint64_t amdgpu_bo_get_va(struct pb_buffer *buf)
1659 {
1660 return ((struct amdgpu_winsys_bo*)buf)->va;
1661 }
1662
1663 void amdgpu_bo_init_functions(struct amdgpu_screen_winsys *ws)
1664 {
1665 ws->base.buffer_set_metadata = amdgpu_buffer_set_metadata;
1666 ws->base.buffer_get_metadata = amdgpu_buffer_get_metadata;
1667 ws->base.buffer_map = amdgpu_bo_map;
1668 ws->base.buffer_unmap = amdgpu_bo_unmap;
1669 ws->base.buffer_wait = amdgpu_bo_wait;
1670 ws->base.buffer_create = amdgpu_buffer_create;
1671 ws->base.buffer_from_handle = amdgpu_bo_from_handle;
1672 ws->base.buffer_from_ptr = amdgpu_bo_from_ptr;
1673 ws->base.buffer_is_user_ptr = amdgpu_bo_is_user_ptr;
1674 ws->base.buffer_is_suballocated = amdgpu_bo_is_suballocated;
1675 ws->base.buffer_get_handle = amdgpu_bo_get_handle;
1676 ws->base.buffer_commit = amdgpu_bo_sparse_commit;
1677 ws->base.buffer_get_virtual_address = amdgpu_bo_get_va;
1678 ws->base.buffer_get_initial_domain = amdgpu_bo_get_initial_domain;
1679 ws->base.buffer_get_flags = amdgpu_bo_get_flags;
1680 }