winsys/amdgpu: use only one fence per BO
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
index 2555d57603e48b4fa3a11501193d86e094461010..a6d4aa4aaa298b22571aefd6c6b63941275804fe 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
@@ -44,7 +44,6 @@ static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout,
    struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
    struct amdgpu_winsys *ws = bo->ws;
    int64_t abs_timeout;
-   int i;
 
    if (timeout == 0) {
       if (p_atomic_read(&bo->num_active_ioctls))
@@ -75,49 +74,42 @@ static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout,
 
    if (timeout == 0) {
       pipe_mutex_lock(ws->bo_fence_lock);
-      for (i = 0; i < RING_LAST; i++)
-         if (bo->fence[i]) {
-            if (amdgpu_fence_wait(bo->fence[i], 0, false)) {
-               /* Release the idle fence to avoid checking it again later. */
-               amdgpu_fence_reference(&bo->fence[i], NULL);
-            } else {
-               pipe_mutex_unlock(ws->bo_fence_lock);
-               return false;
-            }
+      if (bo->fence) {
+         if (amdgpu_fence_wait(bo->fence, 0, false)) {
+            /* Release the idle fence to avoid checking it again later. */
+            amdgpu_fence_reference(&bo->fence, NULL);
+         } else {
+            pipe_mutex_unlock(ws->bo_fence_lock);
+            return false;
          }
+      }
       pipe_mutex_unlock(ws->bo_fence_lock);
       return true;
 
    } else {
-      struct pipe_fence_handle *fence[RING_LAST] = {};
-      bool fence_idle[RING_LAST] = {};
+      struct pipe_fence_handle *fence = NULL;
+      bool fence_idle = false;
       bool buffer_idle = true;
 
-      /* Take references to all fences, so that we can wait for them
+      /* Take a reference to the fence, so that we can wait for it
        * without the lock. */
       pipe_mutex_lock(ws->bo_fence_lock);
-      for (i = 0; i < RING_LAST; i++)
-         amdgpu_fence_reference(&fence[i], bo->fence[i]);
+      amdgpu_fence_reference(&fence, bo->fence);
       pipe_mutex_unlock(ws->bo_fence_lock);
 
-      /* Now wait for the fences. */
-      for (i = 0; i < RING_LAST; i++) {
-         if (fence[i]) {
-            if (amdgpu_fence_wait(fence[i], abs_timeout, true))
-               fence_idle[i] = true;
-            else
-               buffer_idle = false;
-         }
+      /* Now wait for the fence. */
+      if (fence) {
+         if (amdgpu_fence_wait(fence, abs_timeout, true))
+            fence_idle = true;
+         else
+            buffer_idle = false;
       }
 
       /* Release idle fences to avoid checking them again later. */
       pipe_mutex_lock(ws->bo_fence_lock);
-      for (i = 0; i < RING_LAST; i++) {
-         if (fence[i] == bo->fence[i] && fence_idle[i])
-            amdgpu_fence_reference(&bo->fence[i], NULL);
-
-         amdgpu_fence_reference(&fence[i], NULL);
-      }
+      if (fence == bo->fence && fence_idle)
+         amdgpu_fence_reference(&bo->fence, NULL);
+      amdgpu_fence_reference(&fence, NULL);
       pipe_mutex_unlock(ws->bo_fence_lock);
 
       return buffer_idle;
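
Collapsing the per-ring fence array into a single fence only works together with a matching change to the BO struct in amdgpu_bo.h, which is not part of this file's diff. A hedged sketch of what that companion change presumably looks like (the field names come from this diff; the RING_LAST placeholder value and the rest of the struct layout are assumptions):

    /* Sketch only -- the real struct lives in amdgpu_bo.h and has many
     * more fields; RING_LAST really comes from the ring_type enum in
     * radeon_winsys.h, the value here is a placeholder. */
    struct pipe_fence_handle;          /* opaque fence handle */
    enum { RING_LAST = 5 };            /* placeholder value */

    struct amdgpu_winsys_bo_before {   /* one fence slot per hardware ring */
       struct pipe_fence_handle *fence[RING_LAST];
    };

    struct amdgpu_winsys_bo_after {    /* only the most recent fence */
       struct pipe_fence_handle *fence;
    };

With only the most recent fence tracked per BO, the wait-and-release loops above degenerate into a single NULL check.
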
@@ -133,7 +125,6 @@ static enum radeon_bo_domain amdgpu_bo_get_initial_domain(
 void amdgpu_bo_destroy(struct pb_buffer *_buf)
 {
    struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
-   int i;
 
    pipe_mutex_lock(bo->ws->global_bo_list_lock);
    LIST_DEL(&bo->global_list_item);
@@ -144,13 +135,20 @@ void amdgpu_bo_destroy(struct pb_buffer *_buf)
    amdgpu_va_range_free(bo->va_handle);
    amdgpu_bo_free(bo->bo);
 
-   for (i = 0; i < RING_LAST; i++)
-      amdgpu_fence_reference(&bo->fence[i], NULL);
+   amdgpu_fence_reference(&bo->fence, NULL);
 
    if (bo->initial_domain & RADEON_DOMAIN_VRAM)
       bo->ws->allocated_vram -= align64(bo->base.size, bo->ws->info.gart_page_size);
    else if (bo->initial_domain & RADEON_DOMAIN_GTT)
       bo->ws->allocated_gtt -= align64(bo->base.size, bo->ws->info.gart_page_size);
+
+   if (bo->map_count >= 1) {
+      if (bo->initial_domain & RADEON_DOMAIN_VRAM)
+         bo->ws->mapped_vram -= bo->base.size;
+      else
+         bo->ws->mapped_gtt -= bo->base.size;
+   }
+
    FREE(bo);
 }
 
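amdgpu_bo_destroy now also undoes the mapped-memory accounting for buffers freed while still mapped; without that, the counters would leak on every such destroy. The counters themselves are not declared in this file. A hedged sketch of where they presumably live (field names are taken from this diff; the struct placement is an assumption):

    #include <stdint.h>

    /* Sketch: presumably part of struct amdgpu_winsys in amdgpu_winsys.h,
     * next to the allocated_* counters used above. */
    struct amdgpu_winsys_sketch {
       uint64_t allocated_vram;   /* decremented on destroy, as above */
       uint64_t allocated_gtt;
       uint64_t mapped_vram;      /* bytes of VRAM BOs currently CPU-mapped */
       uint64_t mapped_gtt;       /* bytes of GTT BOs currently CPU-mapped */
    };
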
@@ -256,14 +254,33 @@ static void *amdgpu_bo_map(struct pb_buffer *buf,
       /* Clear the cache and try again. */
       pb_cache_release_all_buffers(&bo->ws->bo_cache);
       r = amdgpu_bo_cpu_map(bo->bo, &cpu);
+      if (r)
+         return NULL;
+   }
+
+   if (p_atomic_inc_return(&bo->map_count) == 1) {
+      if (bo->initial_domain & RADEON_DOMAIN_VRAM)
+         bo->ws->mapped_vram += bo->base.size;
+      else
+         bo->ws->mapped_gtt += bo->base.size;
    }
-   return r ? NULL : cpu;
+   return cpu;
 }
 
 static void amdgpu_bo_unmap(struct pb_buffer *buf)
 {
    struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
 
+   if (bo->user_ptr)
+      return;
+
+   if (p_atomic_dec_zero(&bo->map_count)) {
+      if (bo->initial_domain & RADEON_DOMAIN_VRAM)
+         bo->ws->mapped_vram -= bo->base.size;
+      else
+         bo->ws->mapped_gtt -= bo->base.size;
+   }
+
    amdgpu_bo_cpu_unmap(bo->bo);
 }
 
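The map_count accounting above hinges on two atomic helpers from src/util/u_atomic.h: p_atomic_inc_return (increment, return the new value) and p_atomic_dec_zero (decrement, report whether the counter reached zero). A self-contained model of that behavior using C11 atomics rather than Mesa's actual macros (the names inc_return/dec_zero are illustrative):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* Model of p_atomic_inc_return: increment, return the new value. */
    static int inc_return(atomic_int *v) { return atomic_fetch_add(v, 1) + 1; }

    /* Model of p_atomic_dec_zero: decrement, true if the new value is 0. */
    static bool dec_zero(atomic_int *v) { return atomic_fetch_sub(v, 1) - 1 == 0; }

    int main(void)
    {
       atomic_int map_count = 0;

       /* First map: 0 -> 1, so the BO size is added to mapped_* once. */
       if (inc_return(&map_count) == 1)
          printf("first map: account the BO size\n");

       inc_return(&map_count);   /* nested map: no accounting */
       dec_zero(&map_count);     /* nested unmap: no accounting */

       /* Last unmap: 1 -> 0, so the size is subtracted exactly once. */
       if (dec_zero(&map_count))
          printf("last unmap: un-account the BO size\n");
       return 0;
    }

This is why nested map/unmap pairs are free: only the 0 -> 1 and 1 -> 0 transitions touch the shared counters.
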
@@ -287,13 +304,15 @@ static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws,
                                                  unsigned alignment,
                                                  unsigned usage,
                                                  enum radeon_bo_domain initial_domain,
-                                                 unsigned flags)
+                                                 unsigned flags,
+                                                 unsigned pb_cache_bucket)
 {
    struct amdgpu_bo_alloc_request request = {0};
    amdgpu_bo_handle buf_handle;
    uint64_t va = 0;
    struct amdgpu_winsys_bo *bo;
    amdgpu_va_handle va_handle;
+   unsigned va_gap_size;
    int r;
 
    assert(initial_domain & RADEON_DOMAIN_VRAM_GTT);
@@ -302,7 +321,8 @@ static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws,
       return NULL;
    }
 
-   pb_cache_init_entry(&ws->bo_cache, &bo->cache_entry, &bo->base);
+   pb_cache_init_entry(&ws->bo_cache, &bo->cache_entry, &bo->base,
+                       pb_cache_bucket);
    request.alloc_size = size;
    request.phys_alignment = alignment;
 
@@ -327,8 +347,9 @@ static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws,
       goto error_bo_alloc;
    }
 
+   va_gap_size = ws->check_vm ? MAX2(4 * alignment, 64 * 1024) : 0;
    r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
-                             size, alignment, 0, &va, &va_handle, 0);
+                             size + va_gap_size, alignment, 0, &va, &va_handle, 0);
    if (r)
       goto error_va_alloc;
 
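With ws->check_vm set, every buffer's virtual address range is padded by MAX2(4 * alignment, 64 * 1024) bytes, presumably so that GPU accesses running past the end of a buffer land in unmapped address space and trigger a VM fault instead of silently corrupting the next allocation. A standalone check of the gap arithmetic (MAX2 is re-declared here for illustration; in Mesa it comes from the util headers):

    #include <stdio.h>

    #define MAX2(a, b) ((a) > (b) ? (a) : (b))

    int main(void)
    {
       unsigned alignments[] = { 4096, 65536, 262144 };

       for (int i = 0; i < 3; i++) {
          /* 64 KiB floor; larger alignments scale the guard gap up. */
          unsigned gap = MAX2(4 * alignments[i], 64 * 1024);
          printf("alignment %7u -> va_gap_size %7u\n", alignments[i], gap);
       }
       return 0;
    }

For a typical 4 KiB alignment the gap is 64 KiB (since 16 KiB < 64 KiB), while a 256 KiB alignment yields a 1 MiB gap.
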
@@ -485,7 +506,7 @@ amdgpu_bo_create(struct radeon_winsys *rws,
 {
    struct amdgpu_winsys *ws = amdgpu_winsys(rws);
    struct amdgpu_winsys_bo *bo;
-   unsigned usage = 0;
+   unsigned usage = 0, pb_cache_bucket;
 
    /* Align size to page size. This is the minimum alignment for normal
     * BOs. Aligning this here helps the cached bufmgr. Especially small BOs,
@@ -504,18 +525,31 @@ amdgpu_bo_create(struct radeon_winsys *rws,
    assert(flags < sizeof(usage) * 8 - 3);
    usage |= 1 << (flags + 3);
 
+   /* Determine the pb_cache bucket for minimizing pb_cache misses. */
+   pb_cache_bucket = 0;
+   if (size <= 4096) /* small buffers */
+      pb_cache_bucket += 1;
+   if (domain & RADEON_DOMAIN_VRAM) /* VRAM or VRAM+GTT */
+      pb_cache_bucket += 2;
+   if (flags == RADEON_FLAG_GTT_WC) /* WC */
+      pb_cache_bucket += 4;
+   assert(pb_cache_bucket < ARRAY_SIZE(ws->bo_cache.buckets));
+
    /* Get a buffer from the cache. */
    bo = (struct amdgpu_winsys_bo*)
-        pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment, usage);
+        pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment, usage,
+                                pb_cache_bucket);
    if (bo)
       return &bo->base;
 
    /* Create a new one. */
-   bo = amdgpu_create_bo(ws, size, alignment, usage, domain, flags);
+   bo = amdgpu_create_bo(ws, size, alignment, usage, domain, flags,
+                         pb_cache_bucket);
    if (!bo) {
       /* Clear the cache and try again. */
       pb_cache_release_all_buffers(&ws->bo_cache);
-      bo = amdgpu_create_bo(ws, size, alignment, usage, domain, flags);
+      bo = amdgpu_create_bo(ws, size, alignment, usage, domain, flags,
+                            pb_cache_bucket);
       if (!bo)
          return NULL;
    }
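
The three ifs above pack a 3-bit bucket index, so a reclaim can only ever return a cached buffer with the same size class, placement, and caching attributes. Note that the WC test is an exact comparison (flags == RADEON_FLAG_GTT_WC), not a bit test, so it only fires when GTT_WC is the sole flag. A standalone enumeration of the eight resulting buckets (the decoding is inferred from the diff):

    #include <stdio.h>

    int main(void)
    {
       /* bit 0: size <= 4096; bit 1: VRAM (or VRAM+GTT); bit 2: pure GTT_WC */
       for (unsigned i = 0; i < 8; i++)
          printf("bucket %u: %s, %s, %s\n", i,
                 (i & 1) ? "small (<= 4 KiB)" : "large",
                 (i & 2) ? "VRAM" : "GTT",
                 (i & 4) ? "write-combined" : "cached");
       return 0;
    }
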
@@ -617,10 +651,10 @@ error:
    return NULL;
 }
 
-static boolean amdgpu_bo_get_handle(struct pb_buffer *buffer,
-                                    unsigned stride, unsigned offset,
-                                    unsigned slice_size,
-                                    struct winsys_handle *whandle)
+static bool amdgpu_bo_get_handle(struct pb_buffer *buffer,
+                                 unsigned stride, unsigned offset,
+                                 unsigned slice_size,
+                                 struct winsys_handle *whandle)
 {
    struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(buffer);
    enum amdgpu_bo_handle_type type;
@@ -639,18 +673,18 @@ static boolean amdgpu_bo_get_handle(struct pb_buffer *buffer,
       type = amdgpu_bo_handle_type_kms;
       break;
    default:
-      return FALSE;
+      return false;
    }
 
    r = amdgpu_bo_export(bo->bo, type, &whandle->handle);
    if (r)
-      return FALSE;
+      return false;
 
    whandle->stride = stride;
    whandle->offset = offset;
    whandle->offset += slice_size * whandle->layer;
    bo->is_shared = true;
-   return TRUE;
+   return true;
 }
 
 static struct pb_buffer *amdgpu_bo_from_ptr(struct radeon_winsys *rws,