1 /*
2 * Copyright © 2011 Marek Olšák <maraeo@gmail.com>
3 * Copyright © 2015 Advanced Micro Devices, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
16 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
18 * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * The above copyright notice and this permission notice (including the
24 * next paragraph) shall be included in all copies or substantial portions
25 * of the Software.
26 */
27
28 #include "amdgpu_cs.h"
29
30 #include "util/os_time.h"
31 #include "util/u_hash_table.h"
32 #include "state_tracker/drm_driver.h"
33 #include <amdgpu_drm.h>
34 #include <xf86drm.h>
35 #include <stdio.h>
36 #include <inttypes.h>
37
38 #ifndef AMDGPU_GEM_CREATE_VM_ALWAYS_VALID
39 #define AMDGPU_GEM_CREATE_VM_ALWAYS_VALID (1 << 6)
40 #endif
41
42 #ifndef AMDGPU_VA_RANGE_HIGH
43 #define AMDGPU_VA_RANGE_HIGH 0x2
44 #endif
45
46 /* Set to 1 for verbose output showing committed sparse buffer ranges. */
47 #define DEBUG_SPARSE_COMMITS 0
48
49 struct amdgpu_sparse_backing_chunk {
50 uint32_t begin, end;
51 };
52
53 static struct pb_buffer *
54 amdgpu_bo_create(struct radeon_winsys *rws,
55 uint64_t size,
56 unsigned alignment,
57 enum radeon_bo_domain domain,
58 enum radeon_bo_flag flags);
59 static void amdgpu_bo_unmap(struct pb_buffer *buf);
60
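/* Return true if the buffer is idle for the requested usage, waiting up to
 * "timeout" nanoseconds. A timeout of 0 only polls: pending ioctls or
 * unsignalled fences make it return false immediately. Shared buffers fall
 * back to amdgpu_bo_wait_for_idle, because user fences only cover this
 * process.
 */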
61 static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout,
62 enum radeon_bo_usage usage)
63 {
64 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
65 struct amdgpu_winsys *ws = bo->ws;
66 int64_t abs_timeout;
67
68 if (timeout == 0) {
69 if (p_atomic_read(&bo->num_active_ioctls))
70 return false;
71
72 } else {
73 abs_timeout = os_time_get_absolute_timeout(timeout);
74
75 /* Wait if any ioctl is being submitted with this buffer. */
76 if (!os_wait_until_zero_abs_timeout(&bo->num_active_ioctls, abs_timeout))
77 return false;
78 }
79
80 if (bo->is_shared) {
81 /* We can't use user fences for shared buffers, because user fences
82 * are local to this process only. If we want to wait for all buffer
83 * uses in all processes, we have to use amdgpu_bo_wait_for_idle.
84 */
85 bool buffer_busy = true;
86 int r;
87
88 r = amdgpu_bo_wait_for_idle(bo->bo, timeout, &buffer_busy);
89 if (r)
90 fprintf(stderr, "%s: amdgpu_bo_wait_for_idle failed %i\n", __func__,
91 r);
92 return !buffer_busy;
93 }
94
95 if (timeout == 0) {
96 unsigned idle_fences;
97 bool buffer_idle;
98
99 simple_mtx_lock(&ws->bo_fence_lock);
100
101 for (idle_fences = 0; idle_fences < bo->num_fences; ++idle_fences) {
102 if (!amdgpu_fence_wait(bo->fences[idle_fences], 0, false))
103 break;
104 }
105
106 /* Release the idle fences to avoid checking them again later. */
107 for (unsigned i = 0; i < idle_fences; ++i)
108 amdgpu_fence_reference(&bo->fences[i], NULL);
109
110 memmove(&bo->fences[0], &bo->fences[idle_fences],
111 (bo->num_fences - idle_fences) * sizeof(*bo->fences));
112 bo->num_fences -= idle_fences;
113
114 buffer_idle = !bo->num_fences;
115 simple_mtx_unlock(&ws->bo_fence_lock);
116
117 return buffer_idle;
118 } else {
119 bool buffer_idle = true;
120
121 simple_mtx_lock(&ws->bo_fence_lock);
122 while (bo->num_fences && buffer_idle) {
123 struct pipe_fence_handle *fence = NULL;
124 bool fence_idle = false;
125
126 amdgpu_fence_reference(&fence, bo->fences[0]);
127
128 /* Wait for the fence. */
129 simple_mtx_unlock(&ws->bo_fence_lock);
130 if (amdgpu_fence_wait(fence, abs_timeout, true))
131 fence_idle = true;
132 else
133 buffer_idle = false;
134 simple_mtx_lock(&ws->bo_fence_lock);
135
136 /* Release an idle fence to avoid checking it again later, keeping in
137 * mind that the fence array may have been modified by other threads.
138 */
139 if (fence_idle && bo->num_fences && bo->fences[0] == fence) {
140 amdgpu_fence_reference(&bo->fences[0], NULL);
141 memmove(&bo->fences[0], &bo->fences[1],
142 (bo->num_fences - 1) * sizeof(*bo->fences));
143 bo->num_fences--;
144 }
145
146 amdgpu_fence_reference(&fence, NULL);
147 }
148 simple_mtx_unlock(&ws->bo_fence_lock);
149
150 return buffer_idle;
151 }
152 }
153
154 static enum radeon_bo_domain amdgpu_bo_get_initial_domain(
155 struct pb_buffer *buf)
156 {
157 return ((struct amdgpu_winsys_bo*)buf)->initial_domain;
158 }
159
160 static void amdgpu_bo_remove_fences(struct amdgpu_winsys_bo *bo)
161 {
162 for (unsigned i = 0; i < bo->num_fences; ++i)
163 amdgpu_fence_reference(&bo->fences[i], NULL);
164
165 FREE(bo->fences);
166 bo->num_fences = 0;
167 bo->max_fences = 0;
168 }
169
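/* Destroy a real (non-slab, non-sparse) buffer: drop the CPU mapping if we
 * own it, remove the buffer from the global list and the export table, unmap
 * and free its VA range, release the kernel handle, and update the memory
 * accounting.
 */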
170 void amdgpu_bo_destroy(struct pb_buffer *_buf)
171 {
172 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
173 struct amdgpu_winsys *ws = bo->ws;
174
175 assert(bo->bo && "must not be called for slab entries");
176
177 if (!bo->is_user_ptr && bo->cpu_ptr) {
178 bo->cpu_ptr = NULL;
179 amdgpu_bo_unmap(&bo->base);
180 }
181 assert(bo->is_user_ptr || bo->u.real.map_count == 0);
182
183 if (ws->debug_all_bos) {
184 simple_mtx_lock(&ws->global_bo_list_lock);
185 LIST_DEL(&bo->u.real.global_list_item);
186 ws->num_buffers--;
187 simple_mtx_unlock(&ws->global_bo_list_lock);
188 }
189
190 simple_mtx_lock(&ws->bo_export_table_lock);
191 util_hash_table_remove(ws->bo_export_table, bo->bo);
192 simple_mtx_unlock(&ws->bo_export_table_lock);
193
194 amdgpu_bo_va_op(bo->bo, 0, bo->base.size, bo->va, 0, AMDGPU_VA_OP_UNMAP);
195 amdgpu_va_range_free(bo->u.real.va_handle);
196 amdgpu_bo_free(bo->bo);
197
198 amdgpu_bo_remove_fences(bo);
199
200 if (bo->initial_domain & RADEON_DOMAIN_VRAM)
201 ws->allocated_vram -= align64(bo->base.size, ws->info.gart_page_size);
202 else if (bo->initial_domain & RADEON_DOMAIN_GTT)
203 ws->allocated_gtt -= align64(bo->base.size, ws->info.gart_page_size);
204
205 simple_mtx_destroy(&bo->lock);
206 FREE(bo);
207 }
208
209 static void amdgpu_bo_destroy_or_cache(struct pb_buffer *_buf)
210 {
211 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
212
213 assert(bo->bo); /* slab buffers have a separate vtbl */
214
215 if (bo->u.real.use_reusable_pool)
216 pb_cache_add_buffer(&bo->u.real.cache_entry);
217 else
218 amdgpu_bo_destroy(_buf);
219 }
220
221 static void amdgpu_clean_up_buffer_managers(struct amdgpu_winsys *ws)
222 {
223 for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++)
224 pb_slabs_reclaim(&ws->bo_slabs[i]);
225
226 pb_cache_release_all_buffers(&ws->bo_cache);
227 }
228
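/* Perform the actual CPU mapping of a real buffer. On failure, the buffer
 * managers are flushed once and the mapping is retried. The first successful
 * map of a buffer updates the mapped_vram/mapped_gtt statistics.
 */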
229 static bool amdgpu_bo_do_map(struct amdgpu_winsys_bo *bo, void **cpu)
230 {
231 assert(!bo->sparse && bo->bo && !bo->is_user_ptr);
232 int r = amdgpu_bo_cpu_map(bo->bo, cpu);
233 if (r) {
234 /* Clean up buffer managers and try again. */
235 amdgpu_clean_up_buffer_managers(bo->ws);
236 r = amdgpu_bo_cpu_map(bo->bo, cpu);
237 if (r)
238 return false;
239 }
240
241 if (p_atomic_inc_return(&bo->u.real.map_count) == 1) {
242 if (bo->initial_domain & RADEON_DOMAIN_VRAM)
243 bo->ws->mapped_vram += bo->base.size;
244 else if (bo->initial_domain & RADEON_DOMAIN_GTT)
245 bo->ws->mapped_gtt += bo->base.size;
246 bo->ws->num_mapped_buffers++;
247 }
248
249 return true;
250 }
251
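/* Map a buffer for CPU access. Synchronization overview:
 *  - PIPE_TRANSFER_UNSYNCHRONIZED: map right away without waiting.
 *  - PIPE_TRANSFER_DONTBLOCK: if the buffer is busy, kick off an async flush
 *    and return NULL instead of waiting.
 *  - otherwise: flush the CS if it references the buffer and wait for the GPU
 *    (only for writers when mapping for read, for all usage when mapping for
 *    write).
 * Slab entries are mapped through their parent buffer at the proper offset.
 */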
252 static void *amdgpu_bo_map(struct pb_buffer *buf,
253 struct radeon_cmdbuf *rcs,
254 enum pipe_transfer_usage usage)
255 {
256 struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
257 struct amdgpu_winsys_bo *real;
258 struct amdgpu_cs *cs = (struct amdgpu_cs*)rcs;
259
260 assert(!bo->sparse);
261
262 /* If it's not unsynchronized bo_map, flush CS if needed and then wait. */
263 if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) {
264 /* DONTBLOCK doesn't make sense with UNSYNCHRONIZED. */
265 if (usage & PIPE_TRANSFER_DONTBLOCK) {
266 if (!(usage & PIPE_TRANSFER_WRITE)) {
267 /* Mapping for read.
268 *
269 * Since we are mapping for read, we don't need to wait
270 * if the GPU is using the buffer for read too
271 * (neither one is changing it).
272 *
273 * Only check whether the buffer is being used for write. */
274 if (cs && amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo,
275 RADEON_USAGE_WRITE)) {
276 cs->flush_cs(cs->flush_data,
277 RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
278 return NULL;
279 }
280
281 if (!amdgpu_bo_wait((struct pb_buffer*)bo, 0,
282 RADEON_USAGE_WRITE)) {
283 return NULL;
284 }
285 } else {
286 if (cs && amdgpu_bo_is_referenced_by_cs(cs, bo)) {
287 cs->flush_cs(cs->flush_data,
288 RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
289 return NULL;
290 }
291
292 if (!amdgpu_bo_wait((struct pb_buffer*)bo, 0,
293 RADEON_USAGE_READWRITE)) {
294 return NULL;
295 }
296 }
297 } else {
298 uint64_t time = os_time_get_nano();
299
300 if (!(usage & PIPE_TRANSFER_WRITE)) {
301 /* Mapping for read.
302 *
303 * Since we are mapping for read, we don't need to wait
304 * if the GPU is using the buffer for read too
305 * (neither one is changing it).
306 *
307 * Only check whether the buffer is being used for write. */
308 if (cs) {
309 if (amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo,
310 RADEON_USAGE_WRITE)) {
311 cs->flush_cs(cs->flush_data,
312 RADEON_FLUSH_START_NEXT_GFX_IB_NOW, NULL);
313 } else {
314 /* Try to avoid busy-waiting in amdgpu_bo_wait. */
315 if (p_atomic_read(&bo->num_active_ioctls))
316 amdgpu_cs_sync_flush(rcs);
317 }
318 }
319
320 amdgpu_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
321 RADEON_USAGE_WRITE);
322 } else {
323 /* Mapping for write. */
324 if (cs) {
325 if (amdgpu_bo_is_referenced_by_cs(cs, bo)) {
326 cs->flush_cs(cs->flush_data,
327 RADEON_FLUSH_START_NEXT_GFX_IB_NOW, NULL);
328 } else {
329 /* Try to avoid busy-waiting in amdgpu_bo_wait. */
330 if (p_atomic_read(&bo->num_active_ioctls))
331 amdgpu_cs_sync_flush(rcs);
332 }
333 }
334
335 amdgpu_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
336 RADEON_USAGE_READWRITE);
337 }
338
339 bo->ws->buffer_wait_time += os_time_get_nano() - time;
340 }
341 }
342
343 /* Buffer synchronization has been checked, now actually map the buffer. */
344 void *cpu = NULL;
345 uint64_t offset = 0;
346
347 if (bo->bo) {
348 real = bo;
349 } else {
350 real = bo->u.slab.real;
351 offset = bo->va - real->va;
352 }
353
354 if (usage & RADEON_TRANSFER_TEMPORARY) {
355 if (real->is_user_ptr) {
356 cpu = real->cpu_ptr;
357 } else {
358 if (!amdgpu_bo_do_map(real, &cpu))
359 return NULL;
360 }
361 } else {
362 cpu = p_atomic_read(&real->cpu_ptr);
363 if (!cpu) {
364 simple_mtx_lock(&real->lock);
365 /* Must re-check due to the possibility of a race. Re-check need not
366 * be atomic thanks to the lock. */
367 cpu = real->cpu_ptr;
368 if (!cpu) {
369 if (!amdgpu_bo_do_map(real, &cpu)) {
370 simple_mtx_unlock(&real->lock);
371 return NULL;
372 }
373 p_atomic_set(&real->cpu_ptr, cpu);
374 }
375 simple_mtx_unlock(&real->lock);
376 }
377 }
378
379 return (uint8_t*)cpu + offset;
380 }
381
382 static void amdgpu_bo_unmap(struct pb_buffer *buf)
383 {
384 struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
385 struct amdgpu_winsys_bo *real;
386
387 assert(!bo->sparse);
388
389 if (bo->is_user_ptr)
390 return;
391
392 real = bo->bo ? bo : bo->u.slab.real;
393 assert(real->u.real.map_count != 0 && "too many unmaps");
394 if (p_atomic_dec_zero(&real->u.real.map_count)) {
395 assert(!real->cpu_ptr &&
396 "too many unmaps or forgot RADEON_TRANSFER_TEMPORARY flag");
397
398 if (real->initial_domain & RADEON_DOMAIN_VRAM)
399 real->ws->mapped_vram -= real->base.size;
400 else if (real->initial_domain & RADEON_DOMAIN_GTT)
401 real->ws->mapped_gtt -= real->base.size;
402 real->ws->num_mapped_buffers--;
403 }
404
405 amdgpu_bo_cpu_unmap(real->bo);
406 }
407
408 static const struct pb_vtbl amdgpu_winsys_bo_vtbl = {
409 amdgpu_bo_destroy_or_cache
410 /* other functions are never called */
411 };
412
413 static void amdgpu_add_buffer_to_global_list(struct amdgpu_winsys_bo *bo)
414 {
415 struct amdgpu_winsys *ws = bo->ws;
416
417 assert(bo->bo);
418
419 if (ws->debug_all_bos) {
420 simple_mtx_lock(&ws->global_bo_list_lock);
421 LIST_ADDTAIL(&bo->u.real.global_list_item, &ws->global_bo_list);
422 ws->num_buffers++;
423 simple_mtx_unlock(&ws->global_bo_list_lock);
424 }
425 }
426
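/* Allocate a new real buffer: create the kernel BO, allocate a GPU VA range
 * (padded by a gap when VM checking is enabled), map the VA, and update the
 * VRAM/GTT accounting. "heap" >= 0 links the buffer to the pb_cache bucket it
 * can later be returned to.
 */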
427 static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws,
428 uint64_t size,
429 unsigned alignment,
430 enum radeon_bo_domain initial_domain,
431 unsigned flags,
432 int heap)
433 {
434 struct amdgpu_bo_alloc_request request = {0};
435 amdgpu_bo_handle buf_handle;
436 uint64_t va = 0;
437 struct amdgpu_winsys_bo *bo;
438 amdgpu_va_handle va_handle;
439 unsigned va_gap_size;
440 int r;
441
442 /* VRAM or GTT must be specified, but not both at the same time. */
443 assert(util_bitcount(initial_domain & RADEON_DOMAIN_VRAM_GTT) == 1);
444
445 bo = CALLOC_STRUCT(amdgpu_winsys_bo);
446 if (!bo) {
447 return NULL;
448 }
449
450 if (heap >= 0) {
451 pb_cache_init_entry(&ws->bo_cache, &bo->u.real.cache_entry, &bo->base,
452 heap);
453 }
454 request.alloc_size = size;
455 request.phys_alignment = alignment;
456
457 if (initial_domain & RADEON_DOMAIN_VRAM)
458 request.preferred_heap |= AMDGPU_GEM_DOMAIN_VRAM;
459 if (initial_domain & RADEON_DOMAIN_GTT)
460 request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT;
461
462 /* VRAM and GTT have almost the same performance on APUs, so we could just
463 * set GTT. However, to reduce GTT (system RAM) usage, which is shared with
464 * the OS, allow VRAM placements too. The point is not that VRAM is faster
465 * here, but that carve-out VRAM would otherwise sit unused and wasted.
466 */
467 if (!ws->info.has_dedicated_vram)
468 request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT;
469
470 if (flags & RADEON_FLAG_NO_CPU_ACCESS)
471 request.flags |= AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
472 if (flags & RADEON_FLAG_GTT_WC)
473 request.flags |= AMDGPU_GEM_CREATE_CPU_GTT_USWC;
474 if (flags & RADEON_FLAG_NO_INTERPROCESS_SHARING &&
475 ws->info.has_local_buffers)
476 request.flags |= AMDGPU_GEM_CREATE_VM_ALWAYS_VALID;
477 if (ws->zero_all_vram_allocs &&
478 (request.preferred_heap & AMDGPU_GEM_DOMAIN_VRAM))
479 request.flags |= AMDGPU_GEM_CREATE_VRAM_CLEARED;
480
481 r = amdgpu_bo_alloc(ws->dev, &request, &buf_handle);
482 if (r) {
483 fprintf(stderr, "amdgpu: Failed to allocate a buffer:\n");
484 fprintf(stderr, "amdgpu: size : %"PRIu64" bytes\n", size);
485 fprintf(stderr, "amdgpu: alignment : %u bytes\n", alignment);
486 fprintf(stderr, "amdgpu: domains : %u\n", initial_domain);
487 goto error_bo_alloc;
488 }
489
490 va_gap_size = ws->check_vm ? MAX2(4 * alignment, 64 * 1024) : 0;
491 if (size > ws->info.pte_fragment_size)
492 alignment = MAX2(alignment, ws->info.pte_fragment_size);
493 r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
494 size + va_gap_size, alignment, 0, &va, &va_handle,
495 (flags & RADEON_FLAG_32BIT ? AMDGPU_VA_RANGE_32_BIT : 0) |
496 AMDGPU_VA_RANGE_HIGH);
497 if (r)
498 goto error_va_alloc;
499
500 unsigned vm_flags = AMDGPU_VM_PAGE_READABLE |
501 AMDGPU_VM_PAGE_EXECUTABLE;
502
503 if (!(flags & RADEON_FLAG_READ_ONLY))
504 vm_flags |= AMDGPU_VM_PAGE_WRITEABLE;
505
506 r = amdgpu_bo_va_op_raw(ws->dev, buf_handle, 0, size, va, vm_flags,
507 AMDGPU_VA_OP_MAP);
508 if (r)
509 goto error_va_map;
510
511 simple_mtx_init(&bo->lock, mtx_plain);
512 pipe_reference_init(&bo->base.reference, 1);
513 bo->base.alignment = alignment;
514 bo->base.usage = 0;
515 bo->base.size = size;
516 bo->base.vtbl = &amdgpu_winsys_bo_vtbl;
517 bo->ws = ws;
518 bo->bo = buf_handle;
519 bo->va = va;
520 bo->u.real.va_handle = va_handle;
521 bo->initial_domain = initial_domain;
522 bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1);
523 bo->is_local = !!(request.flags & AMDGPU_GEM_CREATE_VM_ALWAYS_VALID);
524
525 if (initial_domain & RADEON_DOMAIN_VRAM)
526 ws->allocated_vram += align64(size, ws->info.gart_page_size);
527 else if (initial_domain & RADEON_DOMAIN_GTT)
528 ws->allocated_gtt += align64(size, ws->info.gart_page_size);
529
530 amdgpu_bo_export(bo->bo, amdgpu_bo_handle_type_kms, &bo->u.real.kms_handle);
531
532 amdgpu_add_buffer_to_global_list(bo);
533
534 return bo;
535
536 error_va_map:
537 amdgpu_va_range_free(va_handle);
538
539 error_va_alloc:
540 amdgpu_bo_free(buf_handle);
541
542 error_bo_alloc:
543 FREE(bo);
544 return NULL;
545 }
546
547 bool amdgpu_bo_can_reclaim(struct pb_buffer *_buf)
548 {
549 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
550
551 if (amdgpu_bo_is_referenced_by_any_cs(bo)) {
552 return false;
553 }
554
555 return amdgpu_bo_wait(_buf, 0, RADEON_USAGE_READWRITE);
556 }
557
558 bool amdgpu_bo_can_reclaim_slab(void *priv, struct pb_slab_entry *entry)
559 {
560 struct amdgpu_winsys_bo *bo = NULL; /* dummy pointer for container_of */
561 bo = container_of(entry, bo, u.slab.entry);
562
563 return amdgpu_bo_can_reclaim(&bo->base);
564 }
565
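/* Pick the slab allocator responsible for the given entry size: the first one
 * whose largest order still fits "size". For example, with min_order = 8 and
 * num_orders = 4 (hypothetical values), the allocator covers entries up to
 * 1 << (8 + 4 - 1) = 2048 bytes.
 */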
566 static struct pb_slabs *get_slabs(struct amdgpu_winsys *ws, uint64_t size)
567 {
568 /* Find the correct slab allocator for the given size. */
569 for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) {
570 struct pb_slabs *slabs = &ws->bo_slabs[i];
571
572 if (size <= 1 << (slabs->min_order + slabs->num_orders - 1))
573 return slabs;
574 }
575
576 assert(0);
577 return NULL;
578 }
579
580 static void amdgpu_bo_slab_destroy(struct pb_buffer *_buf)
581 {
582 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
583
584 assert(!bo->bo);
585
586 pb_slab_free(get_slabs(bo->ws, bo->base.size), &bo->u.slab.entry);
587 }
588
589 static const struct pb_vtbl amdgpu_winsys_bo_slab_vtbl = {
590 amdgpu_bo_slab_destroy
591 /* other functions are never called */
592 };
593
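/* Create a new slab for the given heap. The backing buffer is sized at twice
 * the largest possible entry size (see the loop below) and is then carved
 * into fixed-size entries that alias the parent buffer's storage and VA
 * range.
 */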
594 struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap,
595 unsigned entry_size,
596 unsigned group_index)
597 {
598 struct amdgpu_winsys *ws = priv;
599 struct amdgpu_slab *slab = CALLOC_STRUCT(amdgpu_slab);
600 enum radeon_bo_domain domains = radeon_domain_from_heap(heap);
601 enum radeon_bo_flag flags = radeon_flags_from_heap(heap);
602 uint32_t base_id;
603 unsigned slab_size = 0;
604
605 if (!slab)
606 return NULL;
607
608 /* Determine the slab buffer size. */
609 for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) {
610 struct pb_slabs *slabs = &ws->bo_slabs[i];
611 unsigned max_entry_size = 1 << (slabs->min_order + slabs->num_orders - 1);
612
613 if (entry_size <= max_entry_size) {
614 /* The slab size is twice the size of the largest possible entry. */
615 slab_size = max_entry_size * 2;
616 }
617 }
618 assert(slab_size != 0);
619
620 slab->buffer = amdgpu_winsys_bo(amdgpu_bo_create(&ws->base,
621 slab_size, slab_size,
622 domains, flags));
623 if (!slab->buffer)
624 goto fail;
625
626 slab->base.num_entries = slab->buffer->base.size / entry_size;
627 slab->base.num_free = slab->base.num_entries;
628 slab->entries = CALLOC(slab->base.num_entries, sizeof(*slab->entries));
629 if (!slab->entries)
630 goto fail_buffer;
631
632 LIST_INITHEAD(&slab->base.free);
633
634 base_id = __sync_fetch_and_add(&ws->next_bo_unique_id, slab->base.num_entries);
635
636 for (unsigned i = 0; i < slab->base.num_entries; ++i) {
637 struct amdgpu_winsys_bo *bo = &slab->entries[i];
638
639 simple_mtx_init(&bo->lock, mtx_plain);
640 bo->base.alignment = entry_size;
641 bo->base.usage = slab->buffer->base.usage;
642 bo->base.size = entry_size;
643 bo->base.vtbl = &amdgpu_winsys_bo_slab_vtbl;
644 bo->ws = ws;
645 bo->va = slab->buffer->va + i * entry_size;
646 bo->initial_domain = domains;
647 bo->unique_id = base_id + i;
648 bo->u.slab.entry.slab = &slab->base;
649 bo->u.slab.entry.group_index = group_index;
650
651 if (slab->buffer->bo) {
652 /* The slab is not suballocated. */
653 bo->u.slab.real = slab->buffer;
654 } else {
655 /* The slab is allocated out of a bigger slab. */
656 bo->u.slab.real = slab->buffer->u.slab.real;
657 assert(bo->u.slab.real->bo);
658 }
659
660 LIST_ADDTAIL(&bo->u.slab.entry.head, &slab->base.free);
661 }
662
663 return &slab->base;
664
665 fail_buffer:
666 amdgpu_winsys_bo_reference(&slab->buffer, NULL);
667 fail:
668 FREE(slab);
669 return NULL;
670 }
671
672 void amdgpu_bo_slab_free(void *priv, struct pb_slab *pslab)
673 {
674 struct amdgpu_slab *slab = amdgpu_slab(pslab);
675
676 for (unsigned i = 0; i < slab->base.num_entries; ++i) {
677 amdgpu_bo_remove_fences(&slab->entries[i]);
678 simple_mtx_destroy(&slab->entries[i].lock);
679 }
680
681 FREE(slab->entries);
682 amdgpu_winsys_bo_reference(&slab->buffer, NULL);
683 FREE(slab);
684 }
685
686 #if DEBUG_SPARSE_COMMITS
687 static void
688 sparse_dump(struct amdgpu_winsys_bo *bo, const char *func)
689 {
690 fprintf(stderr, "%s: %p (size=%"PRIu64", num_va_pages=%u) @ %s\n"
691 "Commitments:\n",
692 __func__, bo, bo->base.size, bo->u.sparse.num_va_pages, func);
693
694 struct amdgpu_sparse_backing *span_backing = NULL;
695 uint32_t span_first_backing_page = 0;
696 uint32_t span_first_va_page = 0;
697 uint32_t va_page = 0;
698
699 for (;;) {
700 struct amdgpu_sparse_backing *backing = 0;
701 uint32_t backing_page = 0;
702
703 if (va_page < bo->u.sparse.num_va_pages) {
704 backing = bo->u.sparse.commitments[va_page].backing;
705 backing_page = bo->u.sparse.commitments[va_page].page;
706 }
707
708 if (span_backing &&
709 (backing != span_backing ||
710 backing_page != span_first_backing_page + (va_page - span_first_va_page))) {
711 fprintf(stderr, " %u..%u: backing=%p:%u..%u\n",
712 span_first_va_page, va_page - 1, span_backing,
713 span_first_backing_page,
714 span_first_backing_page + (va_page - span_first_va_page) - 1);
715
716 span_backing = NULL;
717 }
718
719 if (va_page >= bo->u.sparse.num_va_pages)
720 break;
721
722 if (backing && !span_backing) {
723 span_backing = backing;
724 span_first_backing_page = backing_page;
725 span_first_va_page = va_page;
726 }
727
728 va_page++;
729 }
730
731 fprintf(stderr, "Backing:\n");
732
733 list_for_each_entry(struct amdgpu_sparse_backing, backing, &bo->u.sparse.backing, list) {
734 fprintf(stderr, " %p (size=%"PRIu64")\n", backing, backing->bo->base.size);
735 for (unsigned i = 0; i < backing->num_chunks; ++i)
736 fprintf(stderr, " %u..%u\n", backing->chunks[i].begin, backing->chunks[i].end);
737 }
738 }
739 #endif
740
741 /*
742 * Attempt to allocate the given number of backing pages. Fewer pages may be
743 * allocated (depending on the fragmentation of existing backing buffers),
744 * which will be reflected by a change to *pnum_pages.
745 */
746 static struct amdgpu_sparse_backing *
747 sparse_backing_alloc(struct amdgpu_winsys_bo *bo, uint32_t *pstart_page, uint32_t *pnum_pages)
748 {
749 struct amdgpu_sparse_backing *best_backing;
750 unsigned best_idx;
751 uint32_t best_num_pages;
752
753 best_backing = NULL;
754 best_idx = 0;
755 best_num_pages = 0;
756
757 /* This is a very simple and inefficient best-fit algorithm. */
758 list_for_each_entry(struct amdgpu_sparse_backing, backing, &bo->u.sparse.backing, list) {
759 for (unsigned idx = 0; idx < backing->num_chunks; ++idx) {
760 uint32_t cur_num_pages = backing->chunks[idx].end - backing->chunks[idx].begin;
761 if ((best_num_pages < *pnum_pages && cur_num_pages > best_num_pages) ||
762 (best_num_pages > *pnum_pages && cur_num_pages < best_num_pages)) {
763 best_backing = backing;
764 best_idx = idx;
765 best_num_pages = cur_num_pages;
766 }
767 }
768 }
769
770 /* Allocate a new backing buffer if necessary. */
771 if (!best_backing) {
772 struct pb_buffer *buf;
773 uint64_t size;
774 uint32_t pages;
775
776 best_backing = CALLOC_STRUCT(amdgpu_sparse_backing);
777 if (!best_backing)
778 return NULL;
779
780 best_backing->max_chunks = 4;
781 best_backing->chunks = CALLOC(best_backing->max_chunks,
782 sizeof(*best_backing->chunks));
783 if (!best_backing->chunks) {
784 FREE(best_backing);
785 return NULL;
786 }
787
788 assert(bo->u.sparse.num_backing_pages < DIV_ROUND_UP(bo->base.size, RADEON_SPARSE_PAGE_SIZE));
789
790 size = MIN3(bo->base.size / 16,
791 8 * 1024 * 1024,
792 bo->base.size - (uint64_t)bo->u.sparse.num_backing_pages * RADEON_SPARSE_PAGE_SIZE);
793 size = MAX2(size, RADEON_SPARSE_PAGE_SIZE);
794
795 buf = amdgpu_bo_create(&bo->ws->base, size, RADEON_SPARSE_PAGE_SIZE,
796 bo->initial_domain,
797 bo->u.sparse.flags | RADEON_FLAG_NO_SUBALLOC);
798 if (!buf) {
799 FREE(best_backing->chunks);
800 FREE(best_backing);
801 return NULL;
802 }
803
804 /* We might have gotten a bigger buffer than requested via caching. */
805 pages = buf->size / RADEON_SPARSE_PAGE_SIZE;
806
807 best_backing->bo = amdgpu_winsys_bo(buf);
808 best_backing->num_chunks = 1;
809 best_backing->chunks[0].begin = 0;
810 best_backing->chunks[0].end = pages;
811
812 list_add(&best_backing->list, &bo->u.sparse.backing);
813 bo->u.sparse.num_backing_pages += pages;
814
815 best_idx = 0;
816 best_num_pages = pages;
817 }
818
819 *pnum_pages = MIN2(*pnum_pages, best_num_pages);
820 *pstart_page = best_backing->chunks[best_idx].begin;
821 best_backing->chunks[best_idx].begin += *pnum_pages;
822
823 if (best_backing->chunks[best_idx].begin >= best_backing->chunks[best_idx].end) {
824 memmove(&best_backing->chunks[best_idx], &best_backing->chunks[best_idx + 1],
825 sizeof(*best_backing->chunks) * (best_backing->num_chunks - best_idx - 1));
826 best_backing->num_chunks--;
827 }
828
829 return best_backing;
830 }
831
832 static void
833 sparse_free_backing_buffer(struct amdgpu_winsys_bo *bo,
834 struct amdgpu_sparse_backing *backing)
835 {
836 struct amdgpu_winsys *ws = backing->bo->ws;
837
838 bo->u.sparse.num_backing_pages -= backing->bo->base.size / RADEON_SPARSE_PAGE_SIZE;
839
840 simple_mtx_lock(&ws->bo_fence_lock);
841 amdgpu_add_fences(backing->bo, bo->num_fences, bo->fences);
842 simple_mtx_unlock(&ws->bo_fence_lock);
843
844 list_del(&backing->list);
845 amdgpu_winsys_bo_reference(&backing->bo, NULL);
846 FREE(backing->chunks);
847 FREE(backing);
848 }
849
850 /*
851 * Return a range of pages from the given backing buffer back into the
852 * free structure.
853 */
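/* The chunk list is kept sorted and non-overlapping. The freed range is
 * merged into the preceding chunk, merged into the following chunk (possibly
 * joining both), or inserted as a new chunk, growing the chunk array if
 * needed. A backing buffer whose single chunk covers all of its pages is
 * freed entirely.
 */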
854 static bool
855 sparse_backing_free(struct amdgpu_winsys_bo *bo,
856 struct amdgpu_sparse_backing *backing,
857 uint32_t start_page, uint32_t num_pages)
858 {
859 uint32_t end_page = start_page + num_pages;
860 unsigned low = 0;
861 unsigned high = backing->num_chunks;
862
863 /* Find the first chunk with begin >= start_page. */
864 while (low < high) {
865 unsigned mid = low + (high - low) / 2;
866
867 if (backing->chunks[mid].begin >= start_page)
868 high = mid;
869 else
870 low = mid + 1;
871 }
872
873 assert(low >= backing->num_chunks || end_page <= backing->chunks[low].begin);
874 assert(low == 0 || backing->chunks[low - 1].end <= start_page);
875
876 if (low > 0 && backing->chunks[low - 1].end == start_page) {
877 backing->chunks[low - 1].end = end_page;
878
879 if (low < backing->num_chunks && end_page == backing->chunks[low].begin) {
880 backing->chunks[low - 1].end = backing->chunks[low].end;
881 memmove(&backing->chunks[low], &backing->chunks[low + 1],
882 sizeof(*backing->chunks) * (backing->num_chunks - low - 1));
883 backing->num_chunks--;
884 }
885 } else if (low < backing->num_chunks && end_page == backing->chunks[low].begin) {
886 backing->chunks[low].begin = start_page;
887 } else {
888 if (backing->num_chunks >= backing->max_chunks) {
889 unsigned new_max_chunks = 2 * backing->max_chunks;
890 struct amdgpu_sparse_backing_chunk *new_chunks =
891 REALLOC(backing->chunks,
892 sizeof(*backing->chunks) * backing->max_chunks,
893 sizeof(*backing->chunks) * new_max_chunks);
894 if (!new_chunks)
895 return false;
896
897 backing->max_chunks = new_max_chunks;
898 backing->chunks = new_chunks;
899 }
900
901 memmove(&backing->chunks[low + 1], &backing->chunks[low],
902 sizeof(*backing->chunks) * (backing->num_chunks - low));
903 backing->chunks[low].begin = start_page;
904 backing->chunks[low].end = end_page;
905 backing->num_chunks++;
906 }
907
908 if (backing->num_chunks == 1 && backing->chunks[0].begin == 0 &&
909 backing->chunks[0].end == backing->bo->base.size / RADEON_SPARSE_PAGE_SIZE)
910 sparse_free_backing_buffer(bo, backing);
911
912 return true;
913 }
914
915 static void amdgpu_bo_sparse_destroy(struct pb_buffer *_buf)
916 {
917 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
918 int r;
919
920 assert(!bo->bo && bo->sparse);
921
922 r = amdgpu_bo_va_op_raw(bo->ws->dev, NULL, 0,
923 (uint64_t)bo->u.sparse.num_va_pages * RADEON_SPARSE_PAGE_SIZE,
924 bo->va, 0, AMDGPU_VA_OP_CLEAR);
925 if (r) {
926 fprintf(stderr, "amdgpu: clearing PRT VA region on destroy failed (%d)\n", r);
927 }
928
929 while (!list_empty(&bo->u.sparse.backing)) {
930 struct amdgpu_sparse_backing *dummy = NULL; /* dummy pointer for container_of */
931 sparse_free_backing_buffer(bo,
932 container_of(bo->u.sparse.backing.next,
933 dummy, list));
934 }
935
936 amdgpu_va_range_free(bo->u.sparse.va_handle);
937 FREE(bo->u.sparse.commitments);
938 simple_mtx_destroy(&bo->lock);
939 FREE(bo);
940 }
941
942 static const struct pb_vtbl amdgpu_winsys_bo_sparse_vtbl = {
943 amdgpu_bo_sparse_destroy
944 /* other functions are never called */
945 };
946
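/* Create a sparse buffer. Only a PRT-mapped VA range is reserved here; no
 * memory is committed yet. Physical backing pages are attached and detached
 * later through the buffer_commit hook (amdgpu_bo_sparse_commit).
 */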
947 static struct pb_buffer *
948 amdgpu_bo_sparse_create(struct amdgpu_winsys *ws, uint64_t size,
949 enum radeon_bo_domain domain,
950 enum radeon_bo_flag flags)
951 {
952 struct amdgpu_winsys_bo *bo;
953 uint64_t map_size;
954 uint64_t va_gap_size;
955 int r;
956
957 /* We use 32-bit page numbers; refuse to attempt allocating sparse buffers
958 * that exceed this limit. This is not really a restriction: we don't have
959 * that much virtual address space anyway.
960 */
961 if (size > (uint64_t)INT32_MAX * RADEON_SPARSE_PAGE_SIZE)
962 return NULL;
963
964 bo = CALLOC_STRUCT(amdgpu_winsys_bo);
965 if (!bo)
966 return NULL;
967
968 simple_mtx_init(&bo->lock, mtx_plain);
969 pipe_reference_init(&bo->base.reference, 1);
970 bo->base.alignment = RADEON_SPARSE_PAGE_SIZE;
971 bo->base.size = size;
972 bo->base.vtbl = &amdgpu_winsys_bo_sparse_vtbl;
973 bo->ws = ws;
974 bo->initial_domain = domain;
975 bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1);
976 bo->sparse = true;
977 bo->u.sparse.flags = flags & ~RADEON_FLAG_SPARSE;
978
979 bo->u.sparse.num_va_pages = DIV_ROUND_UP(size, RADEON_SPARSE_PAGE_SIZE);
980 bo->u.sparse.commitments = CALLOC(bo->u.sparse.num_va_pages,
981 sizeof(*bo->u.sparse.commitments));
982 if (!bo->u.sparse.commitments)
983 goto error_alloc_commitments;
984
985 LIST_INITHEAD(&bo->u.sparse.backing);
986
987 /* For simplicity, we always map a multiple of the page size. */
988 map_size = align64(size, RADEON_SPARSE_PAGE_SIZE);
989 va_gap_size = ws->check_vm ? 4 * RADEON_SPARSE_PAGE_SIZE : 0;
990 r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
991 map_size + va_gap_size, RADEON_SPARSE_PAGE_SIZE,
992 0, &bo->va, &bo->u.sparse.va_handle,
993 AMDGPU_VA_RANGE_HIGH);
994 if (r)
995 goto error_va_alloc;
996
997 r = amdgpu_bo_va_op_raw(bo->ws->dev, NULL, 0, size, bo->va,
998 AMDGPU_VM_PAGE_PRT, AMDGPU_VA_OP_MAP);
999 if (r)
1000 goto error_va_map;
1001
1002 return &bo->base;
1003
1004 error_va_map:
1005 amdgpu_va_range_free(bo->u.sparse.va_handle);
1006 error_va_alloc:
1007 FREE(bo->u.sparse.commitments);
1008 error_alloc_commitments:
1009 simple_mtx_destroy(&bo->lock);
1010 FREE(bo);
1011 return NULL;
1012 }
1013
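/* Commit or decommit a page-aligned range of a sparse buffer. Committing
 * fills uncommitted spans with chunks of backing memory and maps them with
 * AMDGPU_VA_OP_REPLACE; decommitting replaces the range with a plain PRT
 * mapping and returns the backing pages to their free lists.
 */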
1014 static bool
1015 amdgpu_bo_sparse_commit(struct pb_buffer *buf, uint64_t offset, uint64_t size,
1016 bool commit)
1017 {
1018 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(buf);
1019 struct amdgpu_sparse_commitment *comm;
1020 uint32_t va_page, end_va_page;
1021 bool ok = true;
1022 int r;
1023
1024 assert(bo->sparse);
1025 assert(offset % RADEON_SPARSE_PAGE_SIZE == 0);
1026 assert(offset <= bo->base.size);
1027 assert(size <= bo->base.size - offset);
1028 assert(size % RADEON_SPARSE_PAGE_SIZE == 0 || offset + size == bo->base.size);
1029
1030 comm = bo->u.sparse.commitments;
1031 va_page = offset / RADEON_SPARSE_PAGE_SIZE;
1032 end_va_page = va_page + DIV_ROUND_UP(size, RADEON_SPARSE_PAGE_SIZE);
1033
1034 simple_mtx_lock(&bo->lock);
1035
1036 #if DEBUG_SPARSE_COMMITS
1037 sparse_dump(bo, __func__);
1038 #endif
1039
1040 if (commit) {
1041 while (va_page < end_va_page) {
1042 uint32_t span_va_page;
1043
1044 /* Skip pages that are already committed. */
1045 if (comm[va_page].backing) {
1046 va_page++;
1047 continue;
1048 }
1049
1050 /* Determine length of uncommitted span. */
1051 span_va_page = va_page;
1052 while (va_page < end_va_page && !comm[va_page].backing)
1053 va_page++;
1054
1055 /* Fill the uncommitted span with chunks of backing memory. */
1056 while (span_va_page < va_page) {
1057 struct amdgpu_sparse_backing *backing;
1058 uint32_t backing_start, backing_size;
1059
1060 backing_size = va_page - span_va_page;
1061 backing = sparse_backing_alloc(bo, &backing_start, &backing_size);
1062 if (!backing) {
1063 ok = false;
1064 goto out;
1065 }
1066
1067 r = amdgpu_bo_va_op_raw(bo->ws->dev, backing->bo->bo,
1068 (uint64_t)backing_start * RADEON_SPARSE_PAGE_SIZE,
1069 (uint64_t)backing_size * RADEON_SPARSE_PAGE_SIZE,
1070 bo->va + (uint64_t)span_va_page * RADEON_SPARSE_PAGE_SIZE,
1071 AMDGPU_VM_PAGE_READABLE |
1072 AMDGPU_VM_PAGE_WRITEABLE |
1073 AMDGPU_VM_PAGE_EXECUTABLE,
1074 AMDGPU_VA_OP_REPLACE);
1075 if (r) {
1076 ok = sparse_backing_free(bo, backing, backing_start, backing_size);
1077 assert(ok && "sufficient memory should already be allocated");
1078
1079 ok = false;
1080 goto out;
1081 }
1082
1083 while (backing_size) {
1084 comm[span_va_page].backing = backing;
1085 comm[span_va_page].page = backing_start;
1086 span_va_page++;
1087 backing_start++;
1088 backing_size--;
1089 }
1090 }
1091 }
1092 } else {
1093 r = amdgpu_bo_va_op_raw(bo->ws->dev, NULL, 0,
1094 (uint64_t)(end_va_page - va_page) * RADEON_SPARSE_PAGE_SIZE,
1095 bo->va + (uint64_t)va_page * RADEON_SPARSE_PAGE_SIZE,
1096 AMDGPU_VM_PAGE_PRT, AMDGPU_VA_OP_REPLACE);
1097 if (r) {
1098 ok = false;
1099 goto out;
1100 }
1101
1102 while (va_page < end_va_page) {
1103 struct amdgpu_sparse_backing *backing;
1104 uint32_t backing_start;
1105 uint32_t span_pages;
1106
1107 /* Skip pages that are already uncommitted. */
1108 if (!comm[va_page].backing) {
1109 va_page++;
1110 continue;
1111 }
1112
1113 /* Group contiguous spans of pages. */
1114 backing = comm[va_page].backing;
1115 backing_start = comm[va_page].page;
1116 comm[va_page].backing = NULL;
1117
1118 span_pages = 1;
1119 va_page++;
1120
1121 while (va_page < end_va_page &&
1122 comm[va_page].backing == backing &&
1123 comm[va_page].page == backing_start + span_pages) {
1124 comm[va_page].backing = NULL;
1125 va_page++;
1126 span_pages++;
1127 }
1128
1129 if (!sparse_backing_free(bo, backing, backing_start, span_pages)) {
1130 /* Couldn't allocate the chunk tracking structures, so the backing memory has to leak. */
1131 fprintf(stderr, "amdgpu: leaking PRT backing memory\n");
1132 ok = false;
1133 }
1134 }
1135 }
1136 out:
1137
1138 simple_mtx_unlock(&bo->lock);
1139
1140 return ok;
1141 }
1142
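/* Convert the 3-bit TILE_SPLIT field used in the tiling flags to a byte count
 * (64..4096) and back (eg_tile_split_rev).
 */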
1143 static unsigned eg_tile_split(unsigned tile_split)
1144 {
1145 switch (tile_split) {
1146 case 0: tile_split = 64; break;
1147 case 1: tile_split = 128; break;
1148 case 2: tile_split = 256; break;
1149 case 3: tile_split = 512; break;
1150 default:
1151 case 4: tile_split = 1024; break;
1152 case 5: tile_split = 2048; break;
1153 case 6: tile_split = 4096; break;
1154 }
1155 return tile_split;
1156 }
1157
1158 static unsigned eg_tile_split_rev(unsigned eg_tile_split)
1159 {
1160 switch (eg_tile_split) {
1161 case 64: return 0;
1162 case 128: return 1;
1163 case 256: return 2;
1164 case 512: return 3;
1165 default:
1166 case 1024: return 4;
1167 case 2048: return 5;
1168 case 4096: return 6;
1169 }
1170 }
1171
1172 static void amdgpu_buffer_get_metadata(struct pb_buffer *_buf,
1173 struct radeon_bo_metadata *md)
1174 {
1175 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
1176 struct amdgpu_bo_info info = {0};
1177 uint64_t tiling_flags;
1178 int r;
1179
1180 assert(bo->bo && "must not be called for slab entries");
1181
1182 r = amdgpu_bo_query_info(bo->bo, &info);
1183 if (r)
1184 return;
1185
1186 tiling_flags = info.metadata.tiling_info;
1187
1188 if (bo->ws->info.chip_class >= GFX9) {
1189 md->u.gfx9.swizzle_mode = AMDGPU_TILING_GET(tiling_flags, SWIZZLE_MODE);
1190 } else {
1191 md->u.legacy.microtile = RADEON_LAYOUT_LINEAR;
1192 md->u.legacy.macrotile = RADEON_LAYOUT_LINEAR;
1193
1194 if (AMDGPU_TILING_GET(tiling_flags, ARRAY_MODE) == 4) /* 2D_TILED_THIN1 */
1195 md->u.legacy.macrotile = RADEON_LAYOUT_TILED;
1196 else if (AMDGPU_TILING_GET(tiling_flags, ARRAY_MODE) == 2) /* 1D_TILED_THIN1 */
1197 md->u.legacy.microtile = RADEON_LAYOUT_TILED;
1198
1199 md->u.legacy.pipe_config = AMDGPU_TILING_GET(tiling_flags, PIPE_CONFIG);
1200 md->u.legacy.bankw = 1 << AMDGPU_TILING_GET(tiling_flags, BANK_WIDTH);
1201 md->u.legacy.bankh = 1 << AMDGPU_TILING_GET(tiling_flags, BANK_HEIGHT);
1202 md->u.legacy.tile_split = eg_tile_split(AMDGPU_TILING_GET(tiling_flags, TILE_SPLIT));
1203 md->u.legacy.mtilea = 1 << AMDGPU_TILING_GET(tiling_flags, MACRO_TILE_ASPECT);
1204 md->u.legacy.num_banks = 2 << AMDGPU_TILING_GET(tiling_flags, NUM_BANKS);
1205 md->u.legacy.scanout = AMDGPU_TILING_GET(tiling_flags, MICRO_TILE_MODE) == 0; /* DISPLAY */
1206 }
1207
1208 md->size_metadata = info.metadata.size_metadata;
1209 memcpy(md->metadata, info.metadata.umd_metadata, sizeof(md->metadata));
1210 }
1211
1212 static void amdgpu_buffer_set_metadata(struct pb_buffer *_buf,
1213 struct radeon_bo_metadata *md)
1214 {
1215 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
1216 struct amdgpu_bo_metadata metadata = {0};
1217 uint64_t tiling_flags = 0;
1218
1219 assert(bo->bo && "must not be called for slab entries");
1220
1221 if (bo->ws->info.chip_class >= GFX9) {
1222 tiling_flags |= AMDGPU_TILING_SET(SWIZZLE_MODE, md->u.gfx9.swizzle_mode);
1223 } else {
1224 if (md->u.legacy.macrotile == RADEON_LAYOUT_TILED)
1225 tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 4); /* 2D_TILED_THIN1 */
1226 else if (md->u.legacy.microtile == RADEON_LAYOUT_TILED)
1227 tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 2); /* 1D_TILED_THIN1 */
1228 else
1229 tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 1); /* LINEAR_ALIGNED */
1230
1231 tiling_flags |= AMDGPU_TILING_SET(PIPE_CONFIG, md->u.legacy.pipe_config);
1232 tiling_flags |= AMDGPU_TILING_SET(BANK_WIDTH, util_logbase2(md->u.legacy.bankw));
1233 tiling_flags |= AMDGPU_TILING_SET(BANK_HEIGHT, util_logbase2(md->u.legacy.bankh));
1234 if (md->u.legacy.tile_split)
1235 tiling_flags |= AMDGPU_TILING_SET(TILE_SPLIT, eg_tile_split_rev(md->u.legacy.tile_split));
1236 tiling_flags |= AMDGPU_TILING_SET(MACRO_TILE_ASPECT, util_logbase2(md->u.legacy.mtilea));
1237 tiling_flags |= AMDGPU_TILING_SET(NUM_BANKS, util_logbase2(md->u.legacy.num_banks)-1);
1238
1239 if (md->u.legacy.scanout)
1240 tiling_flags |= AMDGPU_TILING_SET(MICRO_TILE_MODE, 0); /* DISPLAY_MICRO_TILING */
1241 else
1242 tiling_flags |= AMDGPU_TILING_SET(MICRO_TILE_MODE, 1); /* THIN_MICRO_TILING */
1243 }
1244
1245 metadata.tiling_info = tiling_flags;
1246 metadata.size_metadata = md->size_metadata;
1247 memcpy(metadata.umd_metadata, md->metadata, sizeof(md->metadata));
1248
1249 amdgpu_bo_set_metadata(bo->bo, &metadata);
1250 }
1251
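/* Main buffer allocation entry point (radeon_winsys::buffer_create).
 * Small buffers are sub-allocated from slabs, RADEON_FLAG_SPARSE buffers get
 * a PRT VA range, and everything else is served from the reusable cache or a
 * fresh amdgpu_create_bo() call.
 *
 * Illustrative call (the exact flags depend on the caller):
 *
 *    struct pb_buffer *buf =
 *       ws->base.buffer_create(&ws->base, 64 * 1024, 4096,
 *                              RADEON_DOMAIN_VRAM,
 *                              RADEON_FLAG_GTT_WC |
 *                              RADEON_FLAG_NO_INTERPROCESS_SHARING);
 */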
1252 static struct pb_buffer *
1253 amdgpu_bo_create(struct radeon_winsys *rws,
1254 uint64_t size,
1255 unsigned alignment,
1256 enum radeon_bo_domain domain,
1257 enum radeon_bo_flag flags)
1258 {
1259 struct amdgpu_winsys *ws = amdgpu_winsys(rws);
1260 struct amdgpu_winsys_bo *bo;
1261 int heap = -1;
1262
1263 /* VRAM implies WC. This is not optional. */
1264 assert(!(domain & RADEON_DOMAIN_VRAM) || flags & RADEON_FLAG_GTT_WC);
1265
1266 /* NO_CPU_ACCESS is valid with VRAM only. */
1267 assert(domain == RADEON_DOMAIN_VRAM || !(flags & RADEON_FLAG_NO_CPU_ACCESS));
1268
1269 /* Sparse buffers must have NO_CPU_ACCESS set. */
1270 assert(!(flags & RADEON_FLAG_SPARSE) || flags & RADEON_FLAG_NO_CPU_ACCESS);
1271
1272 struct pb_slabs *last_slab = &ws->bo_slabs[NUM_SLAB_ALLOCATORS - 1];
1273 unsigned max_slab_entry_size = 1 << (last_slab->min_order + last_slab->num_orders - 1);
1274
1275 /* Sub-allocate small buffers from slabs. */
1276 if (!(flags & (RADEON_FLAG_NO_SUBALLOC | RADEON_FLAG_SPARSE)) &&
1277 size <= max_slab_entry_size &&
1278 /* The alignment must be at most the size of the smallest slab entry or
1279 * the next power of two. */
1280 alignment <= MAX2(1 << ws->bo_slabs[0].min_order, util_next_power_of_two(size))) {
1281 struct pb_slab_entry *entry;
1282 int heap = radeon_get_heap_index(domain, flags);
1283
1284 if (heap < 0 || heap >= RADEON_MAX_SLAB_HEAPS)
1285 goto no_slab;
1286
1287 struct pb_slabs *slabs = get_slabs(ws, size);
1288 entry = pb_slab_alloc(slabs, size, heap);
1289 if (!entry) {
1290 /* Clean up buffer managers and try again. */
1291 amdgpu_clean_up_buffer_managers(ws);
1292
1293 entry = pb_slab_alloc(slabs, size, heap);
1294 }
1295 if (!entry)
1296 return NULL;
1297
1298 bo = NULL;
1299 bo = container_of(entry, bo, u.slab.entry);
1300
1301 pipe_reference_init(&bo->base.reference, 1);
1302
1303 return &bo->base;
1304 }
1305 no_slab:
1306
1307 if (flags & RADEON_FLAG_SPARSE) {
1308 assert(RADEON_SPARSE_PAGE_SIZE % alignment == 0);
1309
1310 return amdgpu_bo_sparse_create(ws, size, domain, flags);
1311 }
1312
1313 /* This flag is irrelevant for the cache. */
1314 flags &= ~RADEON_FLAG_NO_SUBALLOC;
1315
1316 /* Align the size to the page size. This is the minimum alignment for normal
1317 * BOs and it helps the cached buffer manager; small BOs in particular, such
1318 * as constant/uniform buffers, benefit from the improved reuse.
1319 */
1320 size = align64(size, ws->info.gart_page_size);
1321 alignment = align(alignment, ws->info.gart_page_size);
1322
1323 bool use_reusable_pool = flags & RADEON_FLAG_NO_INTERPROCESS_SHARING;
1324
1325 if (use_reusable_pool) {
1326 heap = radeon_get_heap_index(domain, flags);
1327 assert(heap >= 0 && heap < RADEON_MAX_CACHED_HEAPS);
1328
1329 /* Get a buffer from the cache. */
1330 bo = (struct amdgpu_winsys_bo*)
1331 pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment, 0, heap);
1332 if (bo)
1333 return &bo->base;
1334 }
1335
1336 /* Create a new one. */
1337 bo = amdgpu_create_bo(ws, size, alignment, domain, flags, heap);
1338 if (!bo) {
1339 /* Clean up buffer managers and try again. */
1340 amdgpu_clean_up_buffer_managers(ws);
1341
1342 bo = amdgpu_create_bo(ws, size, alignment, domain, flags, heap);
1343 if (!bo)
1344 return NULL;
1345 }
1346
1347 bo->u.real.use_reusable_pool = use_reusable_pool;
1348 return &bo->base;
1349 }
1350
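/* Import a buffer shared by another process or driver, either from a GEM
 * flink name or a dma-buf fd. If the underlying kernel BO was imported
 * before, the existing amdgpu_winsys_bo is found in bo_export_table and its
 * reference count is bumped instead of creating a duplicate.
 */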
1351 static struct pb_buffer *amdgpu_bo_from_handle(struct radeon_winsys *rws,
1352 struct winsys_handle *whandle,
1353 unsigned *stride,
1354 unsigned *offset)
1355 {
1356 struct amdgpu_winsys *ws = amdgpu_winsys(rws);
1357 struct amdgpu_winsys_bo *bo = NULL;
1358 enum amdgpu_bo_handle_type type;
1359 struct amdgpu_bo_import_result result = {0};
1360 uint64_t va;
1361 amdgpu_va_handle va_handle = NULL;
1362 struct amdgpu_bo_info info = {0};
1363 enum radeon_bo_domain initial = 0;
1364 int r;
1365
1366 switch (whandle->type) {
1367 case WINSYS_HANDLE_TYPE_SHARED:
1368 type = amdgpu_bo_handle_type_gem_flink_name;
1369 break;
1370 case WINSYS_HANDLE_TYPE_FD:
1371 type = amdgpu_bo_handle_type_dma_buf_fd;
1372 break;
1373 default:
1374 return NULL;
1375 }
1376
1377 if (stride)
1378 *stride = whandle->stride;
1379 if (offset)
1380 *offset = whandle->offset;
1381
1382 r = amdgpu_bo_import(ws->dev, type, whandle->handle, &result);
1383 if (r)
1384 return NULL;
1385
1386 simple_mtx_lock(&ws->bo_export_table_lock);
1387 bo = util_hash_table_get(ws->bo_export_table, result.buf_handle);
1388
1389 /* If the amdgpu_winsys_bo instance already exists, bump the reference
1390 * counter and return it.
1391 */
1392 if (bo) {
1393 p_atomic_inc(&bo->base.reference.count);
1394 simple_mtx_unlock(&ws->bo_export_table_lock);
1395
1396 /* Release the buffer handle, because we don't need it anymore.
1397 * This function is returning an existing buffer, which has its own
1398 * handle.
1399 */
1400 amdgpu_bo_free(result.buf_handle);
1401 return &bo->base;
1402 }
1403
1404 /* Get initial domains. */
1405 r = amdgpu_bo_query_info(result.buf_handle, &info);
1406 if (r)
1407 goto error;
1408
1409 r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
1410 result.alloc_size, 1 << 20, 0, &va, &va_handle,
1411 AMDGPU_VA_RANGE_HIGH);
1412 if (r)
1413 goto error;
1414
1415 bo = CALLOC_STRUCT(amdgpu_winsys_bo);
1416 if (!bo)
1417 goto error;
1418
1419 r = amdgpu_bo_va_op(result.buf_handle, 0, result.alloc_size, va, 0, AMDGPU_VA_OP_MAP);
1420 if (r)
1421 goto error;
1422
1423 if (info.preferred_heap & AMDGPU_GEM_DOMAIN_VRAM)
1424 initial |= RADEON_DOMAIN_VRAM;
1425 if (info.preferred_heap & AMDGPU_GEM_DOMAIN_GTT)
1426 initial |= RADEON_DOMAIN_GTT;
1427
1428 /* Initialize the structure. */
1429 simple_mtx_init(&bo->lock, mtx_plain);
1430 pipe_reference_init(&bo->base.reference, 1);
1431 bo->base.alignment = info.phys_alignment;
1432 bo->bo = result.buf_handle;
1433 bo->base.size = result.alloc_size;
1434 bo->base.vtbl = &amdgpu_winsys_bo_vtbl;
1435 bo->ws = ws;
1436 bo->va = va;
1437 bo->u.real.va_handle = va_handle;
1438 bo->initial_domain = initial;
1439 bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1);
1440 bo->is_shared = true;
1441
1442 if (bo->initial_domain & RADEON_DOMAIN_VRAM)
1443 ws->allocated_vram += align64(bo->base.size, ws->info.gart_page_size);
1444 else if (bo->initial_domain & RADEON_DOMAIN_GTT)
1445 ws->allocated_gtt += align64(bo->base.size, ws->info.gart_page_size);
1446
1447 amdgpu_bo_export(bo->bo, amdgpu_bo_handle_type_kms, &bo->u.real.kms_handle);
1448
1449 amdgpu_add_buffer_to_global_list(bo);
1450
1451 util_hash_table_set(ws->bo_export_table, bo->bo, bo);
1452 simple_mtx_unlock(&ws->bo_export_table_lock);
1453
1454 return &bo->base;
1455
1456 error:
1457 simple_mtx_unlock(&ws->bo_export_table_lock);
1458 if (bo)
1459 FREE(bo);
1460 if (va_handle)
1461 amdgpu_va_range_free(va_handle);
1462 amdgpu_bo_free(result.buf_handle);
1463 return NULL;
1464 }
1465
1466 static bool amdgpu_bo_get_handle(struct pb_buffer *buffer,
1467 unsigned stride, unsigned offset,
1468 unsigned slice_size,
1469 struct winsys_handle *whandle)
1470 {
1471 struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(buffer);
1472 struct amdgpu_winsys *ws = bo->ws;
1473 enum amdgpu_bo_handle_type type;
1474 int r;
1475
1476 /* Don't allow exports of slab entries and sparse buffers. */
1477 if (!bo->bo)
1478 return false;
1479
1480 bo->u.real.use_reusable_pool = false;
1481
1482 switch (whandle->type) {
1483 case WINSYS_HANDLE_TYPE_SHARED:
1484 type = amdgpu_bo_handle_type_gem_flink_name;
1485 break;
1486 case WINSYS_HANDLE_TYPE_FD:
1487 type = amdgpu_bo_handle_type_dma_buf_fd;
1488 break;
1489 case WINSYS_HANDLE_TYPE_KMS:
1490 type = amdgpu_bo_handle_type_kms;
1491 break;
1492 default:
1493 return false;
1494 }
1495
1496 r = amdgpu_bo_export(bo->bo, type, &whandle->handle);
1497 if (r)
1498 return false;
1499
1500 simple_mtx_lock(&ws->bo_export_table_lock);
1501 util_hash_table_set(ws->bo_export_table, bo->bo, bo);
1502 simple_mtx_unlock(&ws->bo_export_table_lock);
1503
1504 whandle->stride = stride;
1505 whandle->offset = offset;
1506 whandle->offset += slice_size * whandle->layer;
1507 bo->is_shared = true;
1508 return true;
1509 }
1510
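/* Wrap an application-provided pointer as a GTT buffer (userptr). The size is
 * rounded up to the page size for the kernel calls, but the pb_buffer keeps
 * the original size.
 */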
1511 static struct pb_buffer *amdgpu_bo_from_ptr(struct radeon_winsys *rws,
1512 void *pointer, uint64_t size)
1513 {
1514 struct amdgpu_winsys *ws = amdgpu_winsys(rws);
1515 amdgpu_bo_handle buf_handle;
1516 struct amdgpu_winsys_bo *bo;
1517 uint64_t va;
1518 amdgpu_va_handle va_handle;
1519 /* Avoid failure when the size is not page aligned */
1520 uint64_t aligned_size = align64(size, ws->info.gart_page_size);
1521
1522 bo = CALLOC_STRUCT(amdgpu_winsys_bo);
1523 if (!bo)
1524 return NULL;
1525
1526 if (amdgpu_create_bo_from_user_mem(ws->dev, pointer,
1527 aligned_size, &buf_handle))
1528 goto error;
1529
1530 if (amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
1531 aligned_size, 1 << 12, 0, &va, &va_handle,
1532 AMDGPU_VA_RANGE_HIGH))
1533 goto error_va_alloc;
1534
1535 if (amdgpu_bo_va_op(buf_handle, 0, aligned_size, va, 0, AMDGPU_VA_OP_MAP))
1536 goto error_va_map;
1537
1538 /* Initialize it. */
1539 bo->is_user_ptr = true;
1540 pipe_reference_init(&bo->base.reference, 1);
1541 simple_mtx_init(&bo->lock, mtx_plain);
1542 bo->bo = buf_handle;
1543 bo->base.alignment = 0;
1544 bo->base.size = size;
1545 bo->base.vtbl = &amdgpu_winsys_bo_vtbl;
1546 bo->ws = ws;
1547 bo->cpu_ptr = pointer;
1548 bo->va = va;
1549 bo->u.real.va_handle = va_handle;
1550 bo->initial_domain = RADEON_DOMAIN_GTT;
1551 bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1);
1552
1553 ws->allocated_gtt += aligned_size;
1554
1555 amdgpu_add_buffer_to_global_list(bo);
1556
1557 amdgpu_bo_export(bo->bo, amdgpu_bo_handle_type_kms, &bo->u.real.kms_handle);
1558
1559 return (struct pb_buffer*)bo;
1560
1561 error_va_map:
1562 amdgpu_va_range_free(va_handle);
1563
1564 error_va_alloc:
1565 amdgpu_bo_free(buf_handle);
1566
1567 error:
1568 FREE(bo);
1569 return NULL;
1570 }
1571
1572 static bool amdgpu_bo_is_user_ptr(struct pb_buffer *buf)
1573 {
1574 return ((struct amdgpu_winsys_bo*)buf)->is_user_ptr;
1575 }
1576
1577 static bool amdgpu_bo_is_suballocated(struct pb_buffer *buf)
1578 {
1579 struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
1580
1581 return !bo->bo && !bo->sparse;
1582 }
1583
1584 static uint64_t amdgpu_bo_get_va(struct pb_buffer *buf)
1585 {
1586 return ((struct amdgpu_winsys_bo*)buf)->va;
1587 }
1588
1589 void amdgpu_bo_init_functions(struct amdgpu_winsys *ws)
1590 {
1591 ws->base.buffer_set_metadata = amdgpu_buffer_set_metadata;
1592 ws->base.buffer_get_metadata = amdgpu_buffer_get_metadata;
1593 ws->base.buffer_map = amdgpu_bo_map;
1594 ws->base.buffer_unmap = amdgpu_bo_unmap;
1595 ws->base.buffer_wait = amdgpu_bo_wait;
1596 ws->base.buffer_create = amdgpu_bo_create;
1597 ws->base.buffer_from_handle = amdgpu_bo_from_handle;
1598 ws->base.buffer_from_ptr = amdgpu_bo_from_ptr;
1599 ws->base.buffer_is_user_ptr = amdgpu_bo_is_user_ptr;
1600 ws->base.buffer_is_suballocated = amdgpu_bo_is_suballocated;
1601 ws->base.buffer_get_handle = amdgpu_bo_get_handle;
1602 ws->base.buffer_commit = amdgpu_bo_sparse_commit;
1603 ws->base.buffer_get_virtual_address = amdgpu_bo_get_va;
1604 ws->base.buffer_get_initial_domain = amdgpu_bo_get_initial_domain;
1605 }