winsys/amdgpu: use only one fence per BO
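Besides the fence change named in the title, the diff below also replaces the old pb_manager-based bufmgr with pb_cache, adds a global BO list, and adds per-BO accounting for CPU mappings and in-flight CS ioctls. A minimal sketch of the struct change behind the new code follows; the real definition lives in the winsys header, which is not part of this diff, so the surrounding fields and exact types are assumptions and only the names used in the .c changes (fence, num_active_ioctls, map_count) are taken from the source:

struct pipe_fence_handle;   /* opaque winsys fence type */

struct amdgpu_winsys_bo {
   /* ... buffer handle, VA and allocation fields elided ... */

   /* Before: struct pipe_fence_handle *fence[RING_LAST];
    * After:  a single fence for the last submission that used this BO. */
   struct pipe_fence_handle *fence;

   /* Bookkeeping referenced by this patch (accessed via p_atomic_*):
    * CS ioctls currently in flight for this BO, and active CPU mappings. */
   int num_active_ioctls;
   int map_count;
};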
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
index 50c42e3599af24867073cb658dc9292148556f67..a6d4aa4aaa298b22571aefd6c6b63941275804fe 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
 #include <amdgpu_drm.h>
 #include <xf86drm.h>
 #include <stdio.h>
+#include <inttypes.h>
 
-static const struct pb_vtbl amdgpu_winsys_bo_vtbl;
-
-static inline struct amdgpu_winsys_bo *amdgpu_winsys_bo(struct pb_buffer *bo)
-{
-   assert(bo->vtbl == &amdgpu_winsys_bo_vtbl);
-   return (struct amdgpu_winsys_bo *)bo;
-}
-
-struct amdgpu_bomgr {
-   struct pb_manager base;
-   struct amdgpu_winsys *rws;
-};
-
-static struct amdgpu_winsys *get_winsys(struct pb_manager *mgr)
+static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout,
+                           enum radeon_bo_usage usage)
 {
-   return ((struct amdgpu_bomgr*)mgr)->rws;
-}
+   struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
+   struct amdgpu_winsys *ws = bo->ws;
+   int64_t abs_timeout;
 
-static struct amdgpu_winsys_bo *get_amdgpu_winsys_bo(struct pb_buffer *_buf)
-{
-   struct amdgpu_winsys_bo *bo = NULL;
+   if (timeout == 0) {
+      if (p_atomic_read(&bo->num_active_ioctls))
+         return false;
 
-   if (_buf->vtbl == &amdgpu_winsys_bo_vtbl) {
-      bo = amdgpu_winsys_bo(_buf);
    } else {
-      struct pb_buffer *base_buf;
-      pb_size offset;
-      pb_get_base_buffer(_buf, &base_buf, &offset);
+      abs_timeout = os_time_get_absolute_timeout(timeout);
 
-      if (base_buf->vtbl == &amdgpu_winsys_bo_vtbl)
-         bo = amdgpu_winsys_bo(base_buf);
+      /* Wait if any ioctl is being submitted with this buffer. */
+      if (!os_wait_until_zero_abs_timeout(&bo->num_active_ioctls, abs_timeout))
+         return false;
    }
 
-   return bo;
-}
-
-static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout,
-                           enum radeon_bo_usage usage)
-{
-   struct amdgpu_winsys_bo *bo = get_amdgpu_winsys_bo(_buf);
-   struct amdgpu_winsys *ws = bo->rws;
-   int i;
-
    if (bo->is_shared) {
       /* We can't use user fences for shared buffers, because user fences
        * are local to this process only. If we want to wait for all buffer
@@ -96,52 +73,43 @@ static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout,
    }
 
    if (timeout == 0) {
-      /* Timeout == 0 is quite simple. */
       pipe_mutex_lock(ws->bo_fence_lock);
-      for (i = 0; i < RING_LAST; i++)
-         if (bo->fence[i]) {
-            if (amdgpu_fence_wait(bo->fence[i], 0, false)) {
-               /* Release the idle fence to avoid checking it again later. */
-               amdgpu_fence_reference(&bo->fence[i], NULL);
-            } else {
-               pipe_mutex_unlock(ws->bo_fence_lock);
-               return false;
-            }
+      if (bo->fence) {
+         if (amdgpu_fence_wait(bo->fence, 0, false)) {
+            /* Release the idle fence to avoid checking it again later. */
+            amdgpu_fence_reference(&bo->fence, NULL);
+         } else {
+            pipe_mutex_unlock(ws->bo_fence_lock);
+            return false;
          }
+      }
       pipe_mutex_unlock(ws->bo_fence_lock);
       return true;
 
    } else {
-      struct pipe_fence_handle *fence[RING_LAST] = {};
-      bool fence_idle[RING_LAST] = {};
+      struct pipe_fence_handle *fence = NULL;
+      bool fence_idle = false;
       bool buffer_idle = true;
-      int64_t abs_timeout = os_time_get_absolute_timeout(timeout);
 
-      /* Take references to all fences, so that we can wait for them
+      /* Take a reference to the fence, so that we can wait for it
        * without the lock. */
       pipe_mutex_lock(ws->bo_fence_lock);
-      for (i = 0; i < RING_LAST; i++)
-         amdgpu_fence_reference(&fence[i], bo->fence[i]);
+      amdgpu_fence_reference(&fence, bo->fence);
       pipe_mutex_unlock(ws->bo_fence_lock);
 
-      /* Now wait for the fences. */
-      for (i = 0; i < RING_LAST; i++) {
-         if (fence[i]) {
-            if (amdgpu_fence_wait(fence[i], abs_timeout, true))
-               fence_idle[i] = true;
-            else
-               buffer_idle = false;
-         }
+      /* Now wait for the fence. */
+      if (fence) {
+         if (amdgpu_fence_wait(fence, abs_timeout, true))
+            fence_idle = true;
+         else
+            buffer_idle = false;
       }
 
       /* Release idle fences to avoid checking them again later. */
       pipe_mutex_lock(ws->bo_fence_lock);
-      for (i = 0; i < RING_LAST; i++) {
-         if (fence[i] == bo->fence[i] && fence_idle[i])
-            amdgpu_fence_reference(&bo->fence[i], NULL);
-
-         amdgpu_fence_reference(&fence[i], NULL);
-      }
+      if (fence == bo->fence && fence_idle)
+         amdgpu_fence_reference(&bo->fence, NULL);
+      amdgpu_fence_reference(&fence, NULL);
       pipe_mutex_unlock(ws->bo_fence_lock);
 
       return buffer_idle;
@@ -149,31 +117,52 @@ static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout,
 }
 
 static enum radeon_bo_domain amdgpu_bo_get_initial_domain(
-      struct radeon_winsys_cs_handle *buf)
+      struct pb_buffer *buf)
 {
    return ((struct amdgpu_winsys_bo*)buf)->initial_domain;
 }
 
-static void amdgpu_bo_destroy(struct pb_buffer *_buf)
+void amdgpu_bo_destroy(struct pb_buffer *_buf)
 {
    struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
-   int i;
+
+   pipe_mutex_lock(bo->ws->global_bo_list_lock);
+   LIST_DEL(&bo->global_list_item);
+   bo->ws->num_buffers--;
+   pipe_mutex_unlock(bo->ws->global_bo_list_lock);
 
    amdgpu_bo_va_op(bo->bo, 0, bo->base.size, bo->va, 0, AMDGPU_VA_OP_UNMAP);
    amdgpu_va_range_free(bo->va_handle);
    amdgpu_bo_free(bo->bo);
 
-   for (i = 0; i < RING_LAST; i++)
-      amdgpu_fence_reference(&bo->fence[i], NULL);
+   amdgpu_fence_reference(&bo->fence, NULL);
 
    if (bo->initial_domain & RADEON_DOMAIN_VRAM)
-      bo->rws->allocated_vram -= align(bo->base.size, bo->rws->gart_page_size);
+      bo->ws->allocated_vram -= align64(bo->base.size, bo->ws->info.gart_page_size);
    else if (bo->initial_domain & RADEON_DOMAIN_GTT)
-      bo->rws->allocated_gtt -= align(bo->base.size, bo->rws->gart_page_size);
+      bo->ws->allocated_gtt -= align64(bo->base.size, bo->ws->info.gart_page_size);
+
+   if (bo->map_count >= 1) {
+      if (bo->initial_domain & RADEON_DOMAIN_VRAM)
+         bo->ws->mapped_vram -= bo->base.size;
+      else
+         bo->ws->mapped_gtt -= bo->base.size;
+   }
+
    FREE(bo);
 }
 
-static void *amdgpu_bo_map(struct radeon_winsys_cs_handle *buf,
+static void amdgpu_bo_destroy_or_cache(struct pb_buffer *_buf)
+{
+   struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
+
+   if (bo->use_reusable_pool)
+      pb_cache_add_buffer(&bo->cache_entry);
+   else
+      amdgpu_bo_destroy(_buf);
+}
+
+static void *amdgpu_bo_map(struct pb_buffer *buf,
                            struct radeon_winsys_cs *rcs,
                            enum pipe_transfer_usage usage)
 {
@@ -229,19 +218,30 @@ static void *amdgpu_bo_map(struct radeon_winsys_cs_handle *buf,
             if (cs && amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo,
                                                                RADEON_USAGE_WRITE)) {
                cs->flush_cs(cs->flush_data, 0, NULL);
+            } else {
+               /* Try to avoid busy-waiting in amdgpu_bo_wait. */
+               if (p_atomic_read(&bo->num_active_ioctls))
+                  amdgpu_cs_sync_flush(rcs);
             }
             amdgpu_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
                            RADEON_USAGE_WRITE);
          } else {
             /* Mapping for write. */
-            if (cs && amdgpu_bo_is_referenced_by_cs(cs, bo))
-               cs->flush_cs(cs->flush_data, 0, NULL);
+            if (cs) {
+               if (amdgpu_bo_is_referenced_by_cs(cs, bo)) {
+                  cs->flush_cs(cs->flush_data, 0, NULL);
+               } else {
+                  /* Try to avoid busy-waiting in amdgpu_bo_wait. */
+                  if (p_atomic_read(&bo->num_active_ioctls))
+                     amdgpu_cs_sync_flush(rcs);
+               }
+            }
 
             amdgpu_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
                            RADEON_USAGE_READWRITE);
          }
 
-         bo->rws->buffer_wait_time += os_time_get_nano() - time;
+         bo->ws->buffer_wait_time += os_time_get_nano() - time;
       }
    }
 
@@ -250,90 +250,106 @@ static void *amdgpu_bo_map(struct radeon_winsys_cs_handle *buf,
        return bo->user_ptr;
 
    r = amdgpu_bo_cpu_map(bo->bo, &cpu);
-   return r ? NULL : cpu;
+   if (r) {
+      /* Clear the cache and try again. */
+      pb_cache_release_all_buffers(&bo->ws->bo_cache);
+      r = amdgpu_bo_cpu_map(bo->bo, &cpu);
+      if (r)
+         return NULL;
+   }
+
+   if (p_atomic_inc_return(&bo->map_count) == 1) {
+      if (bo->initial_domain & RADEON_DOMAIN_VRAM)
+         bo->ws->mapped_vram += bo->base.size;
+      else
+         bo->ws->mapped_gtt += bo->base.size;
+   }
+   return cpu;
 }
 
-static void amdgpu_bo_unmap(struct radeon_winsys_cs_handle *buf)
+static void amdgpu_bo_unmap(struct pb_buffer *buf)
 {
    struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
 
+   if (bo->user_ptr)
+      return;
+
+   if (p_atomic_dec_zero(&bo->map_count)) {
+      if (bo->initial_domain & RADEON_DOMAIN_VRAM)
+         bo->ws->mapped_vram -= bo->base.size;
+      else
+         bo->ws->mapped_gtt -= bo->base.size;
+   }
+
    amdgpu_bo_cpu_unmap(bo->bo);
 }
 
-static void amdgpu_bo_get_base_buffer(struct pb_buffer *buf,
-                                      struct pb_buffer **base_buf,
-                                      unsigned *offset)
-{
-   *base_buf = buf;
-   *offset = 0;
-}
+static const struct pb_vtbl amdgpu_winsys_bo_vtbl = {
+   amdgpu_bo_destroy_or_cache
+   /* other functions are never called */
+};
 
-static enum pipe_error amdgpu_bo_validate(struct pb_buffer *_buf,
-                                          struct pb_validate *vl,
-                                          unsigned flags)
+static void amdgpu_add_buffer_to_global_list(struct amdgpu_winsys_bo *bo)
 {
-   /* Always pinned */
-   return PIPE_OK;
-}
+   struct amdgpu_winsys *ws = bo->ws;
 
-static void amdgpu_bo_fence(struct pb_buffer *buf,
-                            struct pipe_fence_handle *fence)
-{
+   pipe_mutex_lock(ws->global_bo_list_lock);
+   LIST_ADDTAIL(&bo->global_list_item, &ws->global_bo_list);
+   ws->num_buffers++;
+   pipe_mutex_unlock(ws->global_bo_list_lock);
 }
 
-static const struct pb_vtbl amdgpu_winsys_bo_vtbl = {
-   amdgpu_bo_destroy,
-   NULL, /* never called */
-   NULL, /* never called */
-   amdgpu_bo_validate,
-   amdgpu_bo_fence,
-   amdgpu_bo_get_base_buffer,
-};
-
-static struct pb_buffer *amdgpu_bomgr_create_bo(struct pb_manager *_mgr,
-                                                pb_size size,
-                                                const struct pb_desc *desc)
+static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws,
+                                                 uint64_t size,
+                                                 unsigned alignment,
+                                                 unsigned usage,
+                                                 enum radeon_bo_domain initial_domain,
+                                                 unsigned flags,
+                                                 unsigned pb_cache_bucket)
 {
-   struct amdgpu_winsys *rws = get_winsys(_mgr);
-   struct amdgpu_bo_desc *rdesc = (struct amdgpu_bo_desc*)desc;
    struct amdgpu_bo_alloc_request request = {0};
    amdgpu_bo_handle buf_handle;
    uint64_t va = 0;
    struct amdgpu_winsys_bo *bo;
    amdgpu_va_handle va_handle;
+   unsigned va_gap_size;
    int r;
 
-   assert(rdesc->initial_domain & RADEON_DOMAIN_VRAM_GTT);
+   assert(initial_domain & RADEON_DOMAIN_VRAM_GTT);
    bo = CALLOC_STRUCT(amdgpu_winsys_bo);
    if (!bo) {
       return NULL;
    }
 
+   pb_cache_init_entry(&ws->bo_cache, &bo->cache_entry, &bo->base,
+                       pb_cache_bucket);
    request.alloc_size = size;
-   request.phys_alignment = desc->alignment;
+   request.phys_alignment = alignment;
 
-   if (rdesc->initial_domain & RADEON_DOMAIN_VRAM) {
+   if (initial_domain & RADEON_DOMAIN_VRAM)
       request.preferred_heap |= AMDGPU_GEM_DOMAIN_VRAM;
-      if (rdesc->flags & RADEON_FLAG_CPU_ACCESS)
-         request.flags |= AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED;
-   }
-   if (rdesc->initial_domain & RADEON_DOMAIN_GTT) {
+   if (initial_domain & RADEON_DOMAIN_GTT)
       request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT;
-      if (rdesc->flags & RADEON_FLAG_GTT_WC)
-         request.flags |= AMDGPU_GEM_CREATE_CPU_GTT_USWC;
-   }
 
-   r = amdgpu_bo_alloc(rws->dev, &request, &buf_handle);
+   if (flags & RADEON_FLAG_CPU_ACCESS)
+      request.flags |= AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED;
+   if (flags & RADEON_FLAG_NO_CPU_ACCESS)
+      request.flags |= AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
+   if (flags & RADEON_FLAG_GTT_WC)
+      request.flags |= AMDGPU_GEM_CREATE_CPU_GTT_USWC;
+
+   r = amdgpu_bo_alloc(ws->dev, &request, &buf_handle);
    if (r) {
       fprintf(stderr, "amdgpu: Failed to allocate a buffer:\n");
-      fprintf(stderr, "amdgpu:    size      : %d bytes\n", size);
-      fprintf(stderr, "amdgpu:    alignment : %d bytes\n", desc->alignment);
-      fprintf(stderr, "amdgpu:    domains   : %d\n", rdesc->initial_domain);
+      fprintf(stderr, "amdgpu:    size      : %"PRIu64" bytes\n", size);
+      fprintf(stderr, "amdgpu:    alignment : %u bytes\n", alignment);
+      fprintf(stderr, "amdgpu:    domains   : %u\n", initial_domain);
       goto error_bo_alloc;
    }
 
-   r = amdgpu_va_range_alloc(rws->dev, amdgpu_gpu_va_range_general,
-                             size, desc->alignment, 0, &va, &va_handle, 0);
+   va_gap_size = ws->check_vm ? MAX2(4 * alignment, 64 * 1024) : 0;
+   r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
+                             size + va_gap_size, alignment, 0, &va, &va_handle, 0);
    if (r)
       goto error_va_alloc;
 
@@ -342,23 +358,25 @@ static struct pb_buffer *amdgpu_bomgr_create_bo(struct pb_manager *_mgr,
       goto error_va_map;
 
    pipe_reference_init(&bo->base.reference, 1);
-   bo->base.alignment = desc->alignment;
-   bo->base.usage = desc->usage;
+   bo->base.alignment = alignment;
+   bo->base.usage = usage;
    bo->base.size = size;
    bo->base.vtbl = &amdgpu_winsys_bo_vtbl;
-   bo->rws = rws;
+   bo->ws = ws;
    bo->bo = buf_handle;
    bo->va = va;
    bo->va_handle = va_handle;
-   bo->initial_domain = rdesc->initial_domain;
-   bo->unique_id = __sync_fetch_and_add(&rws->next_bo_unique_id, 1);
+   bo->initial_domain = initial_domain;
+   bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1);
 
-   if (rdesc->initial_domain & RADEON_DOMAIN_VRAM)
-      rws->allocated_vram += align(size, rws->gart_page_size);
-   else if (rdesc->initial_domain & RADEON_DOMAIN_GTT)
-      rws->allocated_gtt += align(size, rws->gart_page_size);
+   if (initial_domain & RADEON_DOMAIN_VRAM)
+      ws->allocated_vram += align64(size, ws->info.gart_page_size);
+   else if (initial_domain & RADEON_DOMAIN_GTT)
+      ws->allocated_gtt += align64(size, ws->info.gart_page_size);
 
-   return &bo->base;
+   amdgpu_add_buffer_to_global_list(bo);
+
+   return bo;
 
 error_va_map:
    amdgpu_va_range_free(va_handle);
@@ -371,48 +389,15 @@ error_bo_alloc:
    return NULL;
 }
 
-static void amdgpu_bomgr_flush(struct pb_manager *mgr)
-{
-   /* NOP */
-}
-
-/* This is for the cache bufmgr. */
-static boolean amdgpu_bomgr_is_buffer_busy(struct pb_manager *_mgr,
-                                           struct pb_buffer *_buf)
+bool amdgpu_bo_can_reclaim(struct pb_buffer *_buf)
 {
    struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
 
    if (amdgpu_bo_is_referenced_by_any_cs(bo)) {
-      return TRUE;
-   }
-
-   if (!amdgpu_bo_wait((struct pb_buffer*)bo, 0, RADEON_USAGE_READWRITE)) {
-      return TRUE;
+      return false;
    }
 
-   return FALSE;
-}
-
-static void amdgpu_bomgr_destroy(struct pb_manager *mgr)
-{
-   FREE(mgr);
-}
-
-struct pb_manager *amdgpu_bomgr_create(struct amdgpu_winsys *rws)
-{
-   struct amdgpu_bomgr *mgr;
-
-   mgr = CALLOC_STRUCT(amdgpu_bomgr);
-   if (!mgr)
-      return NULL;
-
-   mgr->base.destroy = amdgpu_bomgr_destroy;
-   mgr->base.create_buffer = amdgpu_bomgr_create_bo;
-   mgr->base.flush = amdgpu_bomgr_flush;
-   mgr->base.is_buffer_busy = amdgpu_bomgr_is_buffer_busy;
-
-   mgr->rws = rws;
-   return &mgr->base;
+   return amdgpu_bo_wait(_buf, 0, RADEON_USAGE_READWRITE);
 }
 
 static unsigned eg_tile_split(unsigned tile_split)
@@ -444,16 +429,10 @@ static unsigned eg_tile_split_rev(unsigned eg_tile_split)
    }
 }
 
-static void amdgpu_bo_get_tiling(struct pb_buffer *_buf,
-                                 enum radeon_bo_layout *microtiled,
-                                 enum radeon_bo_layout *macrotiled,
-                                 unsigned *bankw, unsigned *bankh,
-                                 unsigned *tile_split,
-                                 unsigned *stencil_tile_split,
-                                 unsigned *mtilea,
-                                 bool *scanout)
+static void amdgpu_buffer_get_metadata(struct pb_buffer *_buf,
+                                       struct radeon_bo_metadata *md)
 {
-   struct amdgpu_winsys_bo *bo = get_amdgpu_winsys_bo(_buf);
+   struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
    struct amdgpu_bo_info info = {0};
    uint32_t tiling_flags;
    int r;
@@ -464,132 +443,125 @@ static void amdgpu_bo_get_tiling(struct pb_buffer *_buf,
 
    tiling_flags = info.metadata.tiling_info;
 
-   *microtiled = RADEON_LAYOUT_LINEAR;
-   *macrotiled = RADEON_LAYOUT_LINEAR;
+   md->microtile = RADEON_LAYOUT_LINEAR;
+   md->macrotile = RADEON_LAYOUT_LINEAR;
 
    if (AMDGPU_TILING_GET(tiling_flags, ARRAY_MODE) == 4)  /* 2D_TILED_THIN1 */
-      *macrotiled = RADEON_LAYOUT_TILED;
+      md->macrotile = RADEON_LAYOUT_TILED;
    else if (AMDGPU_TILING_GET(tiling_flags, ARRAY_MODE) == 2) /* 1D_TILED_THIN1 */
-      *microtiled = RADEON_LAYOUT_TILED;
-
-   if (bankw && tile_split && mtilea && tile_split) {
-      *bankw = 1 << AMDGPU_TILING_GET(tiling_flags, BANK_WIDTH);
-      *bankh = 1 << AMDGPU_TILING_GET(tiling_flags, BANK_HEIGHT);
-      *tile_split = eg_tile_split(AMDGPU_TILING_GET(tiling_flags, TILE_SPLIT));
-      *mtilea = 1 << AMDGPU_TILING_GET(tiling_flags, MACRO_TILE_ASPECT);
-   }
-   if (scanout)
-      *scanout = AMDGPU_TILING_GET(tiling_flags, MICRO_TILE_MODE) == 0; /* DISPLAY */
+      md->microtile = RADEON_LAYOUT_TILED;
+
+   md->pipe_config = AMDGPU_TILING_GET(tiling_flags, PIPE_CONFIG);
+   md->bankw = 1 << AMDGPU_TILING_GET(tiling_flags, BANK_WIDTH);
+   md->bankh = 1 << AMDGPU_TILING_GET(tiling_flags, BANK_HEIGHT);
+   md->tile_split = eg_tile_split(AMDGPU_TILING_GET(tiling_flags, TILE_SPLIT));
+   md->mtilea = 1 << AMDGPU_TILING_GET(tiling_flags, MACRO_TILE_ASPECT);
+   md->num_banks = 2 << AMDGPU_TILING_GET(tiling_flags, NUM_BANKS);
+   md->scanout = AMDGPU_TILING_GET(tiling_flags, MICRO_TILE_MODE) == 0; /* DISPLAY */
+
+   md->size_metadata = info.metadata.size_metadata;
+   memcpy(md->metadata, info.metadata.umd_metadata, sizeof(md->metadata));
 }
 
-static void amdgpu_bo_set_tiling(struct pb_buffer *_buf,
-                                 struct radeon_winsys_cs *rcs,
-                                 enum radeon_bo_layout microtiled,
-                                 enum radeon_bo_layout macrotiled,
-                                 unsigned pipe_config,
-                                 unsigned bankw, unsigned bankh,
-                                 unsigned tile_split,
-                                 unsigned stencil_tile_split,
-                                 unsigned mtilea, unsigned num_banks,
-                                 uint32_t pitch,
-                                 bool scanout)
+static void amdgpu_buffer_set_metadata(struct pb_buffer *_buf,
+                                       struct radeon_bo_metadata *md)
 {
-   struct amdgpu_winsys_bo *bo = get_amdgpu_winsys_bo(_buf);
+   struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
    struct amdgpu_bo_metadata metadata = {0};
    uint32_t tiling_flags = 0;
 
-   if (macrotiled == RADEON_LAYOUT_TILED)
+   if (md->macrotile == RADEON_LAYOUT_TILED)
       tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 4); /* 2D_TILED_THIN1 */
-   else if (microtiled == RADEON_LAYOUT_TILED)
+   else if (md->microtile == RADEON_LAYOUT_TILED)
       tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 2); /* 1D_TILED_THIN1 */
    else
       tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 1); /* LINEAR_ALIGNED */
 
-   tiling_flags |= AMDGPU_TILING_SET(PIPE_CONFIG, pipe_config);
-   tiling_flags |= AMDGPU_TILING_SET(BANK_WIDTH, util_logbase2(bankw));
-   tiling_flags |= AMDGPU_TILING_SET(BANK_HEIGHT, util_logbase2(bankh));
-   if (tile_split)
-      tiling_flags |= AMDGPU_TILING_SET(TILE_SPLIT, eg_tile_split_rev(tile_split));
-   tiling_flags |= AMDGPU_TILING_SET(MACRO_TILE_ASPECT, util_logbase2(mtilea));
-   tiling_flags |= AMDGPU_TILING_SET(NUM_BANKS, util_logbase2(num_banks)-1);
+   tiling_flags |= AMDGPU_TILING_SET(PIPE_CONFIG, md->pipe_config);
+   tiling_flags |= AMDGPU_TILING_SET(BANK_WIDTH, util_logbase2(md->bankw));
+   tiling_flags |= AMDGPU_TILING_SET(BANK_HEIGHT, util_logbase2(md->bankh));
+   if (md->tile_split)
+      tiling_flags |= AMDGPU_TILING_SET(TILE_SPLIT, eg_tile_split_rev(md->tile_split));
+   tiling_flags |= AMDGPU_TILING_SET(MACRO_TILE_ASPECT, util_logbase2(md->mtilea));
+   tiling_flags |= AMDGPU_TILING_SET(NUM_BANKS, util_logbase2(md->num_banks)-1);
 
-   if (scanout)
+   if (md->scanout)
       tiling_flags |= AMDGPU_TILING_SET(MICRO_TILE_MODE, 0); /* DISPLAY_MICRO_TILING */
    else
       tiling_flags |= AMDGPU_TILING_SET(MICRO_TILE_MODE, 1); /* THIN_MICRO_TILING */
 
    metadata.tiling_info = tiling_flags;
+   metadata.size_metadata = md->size_metadata;
+   memcpy(metadata.umd_metadata, md->metadata, sizeof(md->metadata));
 
    amdgpu_bo_set_metadata(bo->bo, &metadata);
 }
 
-static struct radeon_winsys_cs_handle *amdgpu_get_cs_handle(struct pb_buffer *_buf)
-{
-   /* return a direct pointer to amdgpu_winsys_bo. */
-   return (struct radeon_winsys_cs_handle*)get_amdgpu_winsys_bo(_buf);
-}
-
 static struct pb_buffer *
 amdgpu_bo_create(struct radeon_winsys *rws,
-                 unsigned size,
+                 uint64_t size,
                  unsigned alignment,
-                 boolean use_reusable_pool,
                  enum radeon_bo_domain domain,
                  enum radeon_bo_flag flags)
 {
    struct amdgpu_winsys *ws = amdgpu_winsys(rws);
-   struct amdgpu_bo_desc desc;
-   struct pb_manager *provider;
-   struct pb_buffer *buffer;
-
-   /* Don't use VRAM if the GPU doesn't have much. This is only the initial
-    * domain. The kernel is free to move the buffer if it wants to.
-    *
-    * 64MB means no VRAM by todays standards.
-    */
-   if (domain & RADEON_DOMAIN_VRAM && ws->info.vram_size <= 64*1024*1024) {
-      domain = RADEON_DOMAIN_GTT;
-      flags = RADEON_FLAG_GTT_WC;
-   }
-
-   memset(&desc, 0, sizeof(desc));
-   desc.base.alignment = alignment;
+   struct amdgpu_winsys_bo *bo;
+   unsigned usage = 0, pb_cache_bucket;
 
    /* Align size to page size. This is the minimum alignment for normal
     * BOs. Aligning this here helps the cached bufmgr. Especially small BOs,
     * like constant/uniform buffers, can benefit from better and more reuse.
     */
-   size = align(size, ws->gart_page_size);
+   size = align64(size, ws->info.gart_page_size);
+   alignment = align(alignment, ws->info.gart_page_size);
 
    /* Only set one usage bit each for domains and flags, or the cache manager
     * might consider different sets of domains / flags compatible
     */
    if (domain == RADEON_DOMAIN_VRAM_GTT)
-      desc.base.usage = 1 << 2;
-   else
-      desc.base.usage = domain >> 1;
-   assert(flags < sizeof(desc.base.usage) * 8 - 3);
-   desc.base.usage |= 1 << (flags + 3);
-
-   desc.initial_domain = domain;
-   desc.flags = flags;
-
-   /* Assign a buffer manager. */
-   if (use_reusable_pool)
-      provider = ws->cman;
+      usage = 1 << 2;
    else
-      provider = ws->kman;
-
-   buffer = provider->create_buffer(provider, size, &desc.base);
-   if (!buffer)
-      return NULL;
+      usage = domain >> 1;
+   assert(flags < sizeof(usage) * 8 - 3);
+   usage |= 1 << (flags + 3);
+
+   /* Determine the pb_cache bucket for minimizing pb_cache misses. */
+   pb_cache_bucket = 0;
+   if (size <= 4096) /* small buffers */
+      pb_cache_bucket += 1;
+   if (domain & RADEON_DOMAIN_VRAM) /* VRAM or VRAM+GTT */
+      pb_cache_bucket += 2;
+   if (flags == RADEON_FLAG_GTT_WC) /* WC */
+      pb_cache_bucket += 4;
+   assert(pb_cache_bucket < ARRAY_SIZE(ws->bo_cache.buckets));
+
+   /* Get a buffer from the cache. */
+   bo = (struct amdgpu_winsys_bo*)
+        pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment, usage,
+                                pb_cache_bucket);
+   if (bo)
+      return &bo->base;
+
+   /* Create a new one. */
+   bo = amdgpu_create_bo(ws, size, alignment, usage, domain, flags,
+                         pb_cache_bucket);
+   if (!bo) {
+      /* Clear the cache and try again. */
+      pb_cache_release_all_buffers(&ws->bo_cache);
+      bo = amdgpu_create_bo(ws, size, alignment, usage, domain, flags,
+                            pb_cache_bucket);
+      if (!bo)
+         return NULL;
+   }
 
-   return (struct pb_buffer*)buffer;
+   bo->use_reusable_pool = true;
+   return &bo->base;
 }
 
 static struct pb_buffer *amdgpu_bo_from_handle(struct radeon_winsys *rws,
                                                struct winsys_handle *whandle,
-                                               unsigned *stride)
+                                               unsigned *stride,
+                                               unsigned *offset)
 {
    struct amdgpu_winsys *ws = amdgpu_winsys(rws);
    struct amdgpu_winsys_bo *bo;
@@ -644,11 +616,10 @@ static struct pb_buffer *amdgpu_bo_from_handle(struct radeon_winsys *rws,
 
    pipe_reference_init(&bo->base.reference, 1);
    bo->base.alignment = info.phys_alignment;
-   bo->base.usage = PB_USAGE_GPU_WRITE | PB_USAGE_GPU_READ;
    bo->bo = result.buf_handle;
    bo->base.size = result.alloc_size;
    bo->base.vtbl = &amdgpu_winsys_bo_vtbl;
-   bo->rws = ws;
+   bo->ws = ws;
    bo->va = va;
    bo->va_handle = va_handle;
    bo->initial_domain = initial;
@@ -657,11 +628,15 @@ static struct pb_buffer *amdgpu_bo_from_handle(struct radeon_winsys *rws,
 
    if (stride)
       *stride = whandle->stride;
+   if (offset)
+      *offset = whandle->offset;
 
    if (bo->initial_domain & RADEON_DOMAIN_VRAM)
-      ws->allocated_vram += align(bo->base.size, ws->gart_page_size);
+      ws->allocated_vram += align64(bo->base.size, ws->info.gart_page_size);
    else if (bo->initial_domain & RADEON_DOMAIN_GTT)
-      ws->allocated_gtt += align(bo->base.size, ws->gart_page_size);
+      ws->allocated_gtt += align64(bo->base.size, ws->info.gart_page_size);
+
+   amdgpu_add_buffer_to_global_list(bo);
 
    return &bo->base;
 
@@ -676,14 +651,17 @@ error:
    return NULL;
 }
 
-static boolean amdgpu_bo_get_handle(struct pb_buffer *buffer,
-                                    unsigned stride,
-                                    struct winsys_handle *whandle)
+static bool amdgpu_bo_get_handle(struct pb_buffer *buffer,
+                                 unsigned stride, unsigned offset,
+                                 unsigned slice_size,
+                                 struct winsys_handle *whandle)
 {
-   struct amdgpu_winsys_bo *bo = get_amdgpu_winsys_bo(buffer);
+   struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(buffer);
    enum amdgpu_bo_handle_type type;
    int r;
 
+   bo->use_reusable_pool = false;
+
    switch (whandle->type) {
    case DRM_API_HANDLE_TYPE_SHARED:
       type = amdgpu_bo_handle_type_gem_flink_name;
@@ -695,20 +673,22 @@ static boolean amdgpu_bo_get_handle(struct pb_buffer *buffer,
       type = amdgpu_bo_handle_type_kms;
       break;
    default:
-      return FALSE;
+      return false;
    }
 
    r = amdgpu_bo_export(bo->bo, type, &whandle->handle);
    if (r)
-      return FALSE;
+      return false;
 
    whandle->stride = stride;
+   whandle->offset = offset;
+   whandle->offset += slice_size * whandle->layer;
    bo->is_shared = true;
-   return TRUE;
+   return true;
 }
 
 static struct pb_buffer *amdgpu_bo_from_ptr(struct radeon_winsys *rws,
-                                           void *pointer, unsigned size)
+                                           void *pointer, uint64_t size)
 {
     struct amdgpu_winsys *ws = amdgpu_winsys(rws);
     amdgpu_bo_handle buf_handle;
@@ -734,17 +714,18 @@ static struct pb_buffer *amdgpu_bo_from_ptr(struct radeon_winsys *rws,
     pipe_reference_init(&bo->base.reference, 1);
     bo->bo = buf_handle;
     bo->base.alignment = 0;
-    bo->base.usage = PB_USAGE_GPU_WRITE | PB_USAGE_GPU_READ;
     bo->base.size = size;
     bo->base.vtbl = &amdgpu_winsys_bo_vtbl;
-    bo->rws = ws;
+    bo->ws = ws;
     bo->user_ptr = pointer;
     bo->va = va;
     bo->va_handle = va_handle;
     bo->initial_domain = RADEON_DOMAIN_GTT;
     bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1);
 
-    ws->allocated_gtt += align(bo->base.size, ws->gart_page_size);
+    ws->allocated_gtt += align64(bo->base.size, ws->info.gart_page_size);
+
+    amdgpu_add_buffer_to_global_list(bo);
 
     return (struct pb_buffer*)bo;
 
@@ -759,22 +740,27 @@ error:
     return NULL;
 }
 
-static uint64_t amdgpu_bo_get_va(struct radeon_winsys_cs_handle *buf)
+static bool amdgpu_bo_is_user_ptr(struct pb_buffer *buf)
+{
+   return ((struct amdgpu_winsys_bo*)buf)->user_ptr != NULL;
+}
+
+static uint64_t amdgpu_bo_get_va(struct pb_buffer *buf)
 {
    return ((struct amdgpu_winsys_bo*)buf)->va;
 }
 
-void amdgpu_bomgr_init_functions(struct amdgpu_winsys *ws)
+void amdgpu_bo_init_functions(struct amdgpu_winsys *ws)
 {
-   ws->base.buffer_get_cs_handle = amdgpu_get_cs_handle;
-   ws->base.buffer_set_tiling = amdgpu_bo_set_tiling;
-   ws->base.buffer_get_tiling = amdgpu_bo_get_tiling;
+   ws->base.buffer_set_metadata = amdgpu_buffer_set_metadata;
+   ws->base.buffer_get_metadata = amdgpu_buffer_get_metadata;
    ws->base.buffer_map = amdgpu_bo_map;
    ws->base.buffer_unmap = amdgpu_bo_unmap;
    ws->base.buffer_wait = amdgpu_bo_wait;
    ws->base.buffer_create = amdgpu_bo_create;
    ws->base.buffer_from_handle = amdgpu_bo_from_handle;
    ws->base.buffer_from_ptr = amdgpu_bo_from_ptr;
+   ws->base.buffer_is_user_ptr = amdgpu_bo_is_user_ptr;
    ws->base.buffer_get_handle = amdgpu_bo_get_handle;
    ws->base.buffer_get_virtual_address = amdgpu_bo_get_va;
    ws->base.buffer_get_initial_domain = amdgpu_bo_get_initial_domain;