From 667da4eaed37de143681711344aba19373bee1d0 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Tue, 7 Feb 2017 17:04:49 +0100
Subject: [PATCH] winsys/amdgpu: sparse buffer creation / destruction /
 commitment
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

This is the bulk of the buffer allocation logic. It is fairly simple and
stupid. We'll probably want to use e.g. interval trees at some point to
keep track of commitments, but Mesa doesn't have an implementation of those
yet.

v2:
- remove pipe_mutex_*
- fix total_backing_pages accounting
- simplify by using the new VA_OP_CLEAR/REPLACE kernel interface

Reviewed-by: Marek OlÅ¡Ã¡k <marek.olsak@amd.com>
---
 src/gallium/winsys/amdgpu/drm/amdgpu_bo.c | 401 +++++++++++++++++++++-
 1 file changed, 400 insertions(+), 1 deletion(-)

diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
index 122f47c3cbb..517bf532618 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
@@ -575,6 +575,395 @@ void amdgpu_bo_slab_free(void *priv, struct pb_slab *pslab)
    FREE(slab);
 }
 
+/*
+ * Attempt to allocate the given number of backing pages. Fewer pages may be
+ * allocated (depending on the fragmentation of existing backing buffers),
+ * which will be reflected by a change to *pnum_pages.
+ */
+static struct amdgpu_sparse_backing *
+sparse_backing_alloc(struct amdgpu_winsys_bo *bo, uint32_t *pstart_page, uint32_t *pnum_pages)
+{
+   struct amdgpu_sparse_backing *best_backing;
+   unsigned best_idx;
+   uint32_t best_num_pages;
+
+   best_backing = NULL;
+   best_idx = 0;
+   best_num_pages = 0;
+
+   /* This is a very simple and inefficient best-fit algorithm. */
+   list_for_each_entry(struct amdgpu_sparse_backing, backing, &bo->u.sparse.backing, list) {
+      for (unsigned idx = 0; idx < backing->num_chunks; ++idx) {
+         uint32_t cur_num_pages = backing->chunks[idx].end - backing->chunks[idx].begin;
+         if ((best_num_pages < *pnum_pages && cur_num_pages > best_num_pages) ||
+            (best_num_pages > *pnum_pages && cur_num_pages < best_num_pages)) {
+            best_backing = backing;
+            best_idx = idx;
+            best_num_pages = cur_num_pages;
+         }
+      }
+   }
+
+   /* Allocate a new backing buffer if necessary. */
+   if (!best_backing) {
+      struct pb_buffer *buf;
+      uint64_t size;
+      uint32_t pages;
+
+      best_backing = CALLOC_STRUCT(amdgpu_sparse_backing);
+      if (!best_backing)
+         return NULL;
+
+      best_backing->max_chunks = 4;
+      best_backing->chunks = CALLOC(best_backing->max_chunks,
+                                    sizeof(*best_backing->chunks));
+      if (!best_backing->chunks) {
+         FREE(best_backing);
+         return NULL;
+      }
+
+      assert(bo->u.sparse.num_backing_pages < DIV_ROUND_UP(bo->base.size, RADEON_SPARSE_PAGE_SIZE));
+
+      size = MIN3(bo->base.size / 16,
+                  8 * 1024 * 1024,
+                  bo->base.size - (uint64_t)bo->u.sparse.num_backing_pages * RADEON_SPARSE_PAGE_SIZE);
+      size = MAX2(size, RADEON_SPARSE_PAGE_SIZE);
+
+      buf = amdgpu_bo_create(&bo->ws->base, size, RADEON_SPARSE_PAGE_SIZE,
+                             bo->initial_domain,
+                             bo->u.sparse.flags | RADEON_FLAG_HANDLE);
+      if (!buf) {
+         FREE(best_backing->chunks);
+         FREE(best_backing);
+         return NULL;
+      }
+
+      /* We might have gotten a bigger buffer than requested via caching. */
+      pages = buf->size / RADEON_SPARSE_PAGE_SIZE;
+
+      best_backing->bo = amdgpu_winsys_bo(buf);
+      best_backing->num_chunks = 1;
+      best_backing->chunks[0].begin = 0;
+      best_backing->chunks[0].end = pages;
+
+      list_add(&best_backing->list, &bo->u.sparse.backing);
+      bo->u.sparse.num_backing_pages += pages;
+
+      best_idx = 0;
+      best_num_pages = pages;
+   }
+
+   *pnum_pages = MIN2(*pnum_pages, best_num_pages);
+   *pstart_page = best_backing->chunks[best_idx].begin;
+   best_backing->chunks[best_idx].begin += *pnum_pages;
+
+   if (best_backing->chunks[best_idx].begin >= best_backing->chunks[best_idx].end) {
+      memmove(&best_backing->chunks[best_idx], &best_backing->chunks[best_idx + 1],
+              sizeof(*best_backing->chunks) * (best_backing->num_chunks - best_idx - 1));
+      best_backing->num_chunks--;
+   }
+
+   return best_backing;
+}
+
+static void
+sparse_free_backing_buffer(struct amdgpu_sparse_backing *backing)
+{
+   bo->u.sparse.num_backing_pages -= backing->bo->base.size / RADEON_SPARSE_PAGE_SIZE;
+
+   list_del(&backing->list);
+   amdgpu_winsys_bo_reference(&backing->bo, NULL);
+   FREE(backing->chunks);
+   FREE(backing);
+}
+
+/*
+ * Return a range of pages from the given backing buffer back into the
+ * free structure.
+ */
+static bool
+sparse_backing_free(struct amdgpu_winsys_bo *bo,
+                    struct amdgpu_sparse_backing *backing,
+                    uint32_t start_page, uint32_t num_pages)
+{
+   uint32_t end_page = start_page + num_pages;
+   unsigned low = 0;
+   unsigned high = backing->num_chunks;
+
+   /* Find the first chunk with begin >= start_page. */
+   while (low < high) {
+      unsigned mid = low + (high - low) / 2;
+
+      if (backing->chunks[mid].begin >= start_page)
+         high = mid;
+      else
+         low = mid + 1;
+   }
+
+   assert(low >= backing->num_chunks || end_page <= backing->chunks[low].begin);
+   assert(low == 0 || backing->chunks[low - 1].end <= start_page);
+
+   if (low > 0 && backing->chunks[low - 1].end == start_page) {
+      backing->chunks[low - 1].end = end_page;
+
+      if (low < backing->num_chunks && end_page == backing->chunks[low].begin) {
+         backing->chunks[low - 1].end = backing->chunks[low].end;
+         memmove(&backing->chunks[low], &backing->chunks[low + 1],
+                 sizeof(*backing->chunks) * (backing->num_chunks - low - 1));
+         backing->num_chunks--;
+      }
+   } else if (low < backing->num_chunks && end_page == backing->chunks[low].begin) {
+      backing->chunks[low].begin = start_page;
+   } else {
+      if (backing->num_chunks >= backing->max_chunks) {
+         unsigned new_max_chunks = 2 * backing->max_chunks;
+         struct amdgpu_sparse_backing_chunk *new_chunks =
+            REALLOC(backing->chunks,
+                    sizeof(*backing->chunks) * backing->max_chunks,
+                    sizeof(*backing->chunks) * new_max_chunks);
+         if (!new_chunks)
+            return false;
+
+         backing->max_chunks = new_max_chunks;
+         backing->chunks = new_chunks;
+      }
+
+      memmove(&backing->chunks[low + 1], &backing->chunks[low],
+              sizeof(*backing->chunks) * (backing->num_chunks - low));
+      backing->chunks[low].begin = start_page;
+      backing->chunks[low].end = end_page;
+      backing->num_chunks++;
+   }
+
+   if (backing->num_chunks == 1 && backing->chunks[0].begin == 0 &&
+       backing->chunks[0].end == backing->bo->base.size / RADEON_SPARSE_PAGE_SIZE)
+      sparse_free_backing_buffer(backing);
+
+   return true;
+}
+
+static void amdgpu_bo_sparse_destroy(struct pb_buffer *_buf)
+{
+   struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
+   int r;
+
+   assert(!bo->bo && bo->sparse);
+
+   r = amdgpu_bo_va_op_raw(bo->ws->dev, NULL, 0,
+                           (uint64_t)bo->u.sparse.num_va_pages * RADEON_SPARSE_PAGE_SIZE,
+                           bo->va, 0, AMDGPU_VA_OP_CLEAR);
+   if (r) {
+      fprintf(stderr, "amdgpu: clearing PRT VA region on destroy failed (%d)\n", r);
+   }
+
+   while (!list_empty(&bo->u.sparse.backing)) {
+      struct amdgpu_sparse_backing *dummy = NULL;
+      sparse_free_backing_buffer(container_of(bo->u.sparse.backing.next,
+                                              dummy, list));
+   }
+
+   amdgpu_va_range_free(bo->u.sparse.va_handle);
+   mtx_destroy(&bo->u.sparse.commit_lock);
+   FREE(bo->u.sparse.commitments);
+   FREE(bo);
+}
+
+static const struct pb_vtbl amdgpu_winsys_bo_sparse_vtbl = {
+   amdgpu_bo_sparse_destroy
+   /* other functions are never called */
+};
+
+static struct pb_buffer *
+amdgpu_bo_sparse_create(struct amdgpu_winsys *ws, uint64_t size,
+                        enum radeon_bo_domain domain,
+                        enum radeon_bo_flag flags)
+{
+   struct amdgpu_winsys_bo *bo;
+   uint64_t map_size;
+   uint64_t va_gap_size;
+   int r;
+
+   /* We use 32-bit page numbers; refuse to attempt allocating sparse buffers
+    * that exceed this limit. This is not really a restriction: we don't have
+    * that much virtual address space anyway.
+    */
+   if (size > (uint64_t)INT32_MAX * RADEON_SPARSE_PAGE_SIZE)
+      return NULL;
+
+   bo = CALLOC_STRUCT(amdgpu_winsys_bo);
+   if (!bo)
+      return NULL;
+
+   pipe_reference_init(&bo->base.reference, 1);
+   bo->base.alignment = RADEON_SPARSE_PAGE_SIZE;
+   bo->base.size = size;
+   bo->base.vtbl = &amdgpu_winsys_bo_sparse_vtbl;
+   bo->ws = ws;
+   bo->initial_domain = domain;
+   bo->unique_id =  __sync_fetch_and_add(&ws->next_bo_unique_id, 1);
+   bo->sparse = true;
+   bo->u.sparse.flags = flags & ~RADEON_FLAG_SPARSE;
+
+   bo->u.sparse.num_va_pages = DIV_ROUND_UP(size, RADEON_SPARSE_PAGE_SIZE);
+   bo->u.sparse.commitments = CALLOC(bo->u.sparse.num_va_pages,
+                                     sizeof(*bo->u.sparse.commitments));
+   if (!bo->u.sparse.commitments)
+      goto error_alloc_commitments;
+
+   mtx_init(&bo->u.sparse.commit_lock, mtx_plain);
+   LIST_INITHEAD(&bo->u.sparse.backing);
+
+   /* For simplicity, we always map a multiple of the page size. */
+   map_size = align64(size, RADEON_SPARSE_PAGE_SIZE);
+   va_gap_size = ws->check_vm ? 4 * RADEON_SPARSE_PAGE_SIZE : 0;
+   r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
+                             map_size + va_gap_size, RADEON_SPARSE_PAGE_SIZE,
+                             0, &bo->va, &bo->u.sparse.va_handle, 0);
+   if (r)
+      goto error_va_alloc;
+
+   r = amdgpu_bo_va_op_raw(bo->ws->dev, NULL, 0, size, bo->va,
+                           AMDGPU_VM_PAGE_PRT, AMDGPU_VA_OP_MAP);
+   if (r)
+      goto error_va_map;
+
+   return &bo->base;
+
+error_va_map:
+   amdgpu_va_range_free(bo->u.sparse.va_handle);
+error_va_alloc:
+   mtx_destroy(&bo->u.sparse.commit_lock);
+   FREE(bo->u.sparse.commitments);
+error_alloc_commitments:
+   FREE(bo);
+   return NULL;
+}
+
+static bool
+amdgpu_bo_sparse_commit(struct pb_buffer *buf, uint64_t offset, uint64_t size,
+                        bool commit)
+{
+   struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(buf);
+   struct amdgpu_sparse_commitment *comm;
+   uint32_t va_page, end_va_page;
+   bool ok = true;
+   int r;
+
+   assert(bo->sparse);
+   assert(offset % RADEON_SPARSE_PAGE_SIZE == 0);
+   assert(offset <= bo->base.size);
+   assert(size <= bo->base.size - offset);
+   assert(size % RADEON_SPARSE_PAGE_SIZE == 0 || offset + size == bo->base.size);
+
+   comm = bo->u.sparse.commitments;
+   va_page = offset / RADEON_SPARSE_PAGE_SIZE;
+   end_va_page = va_page + DIV_ROUND_UP(size, RADEON_SPARSE_PAGE_SIZE);
+
+   mtx_lock(&bo->u.sparse.commit_lock);
+
+   if (commit) {
+      while (va_page < end_va_page) {
+         uint32_t span_va_page;
+
+         /* Skip pages that are already committed. */
+         if (comm[va_page].backing) {
+            va_page++;
+            continue;
+         }
+
+         /* Determine length of uncommitted span. */
+         span_va_page = va_page;
+         while (va_page < end_va_page && !comm[va_page].backing)
+            va_page++;
+
+         /* Fill the uncommitted span with chunks of backing memory. */
+         while (span_va_page < va_page) {
+            struct amdgpu_sparse_backing *backing;
+            uint32_t backing_start, backing_size;
+
+            backing_size = va_page - span_va_page;
+            backing = sparse_backing_alloc(bo, &backing_start, &backing_size);
+            if (!backing) {
+               ok = false;
+               goto out;
+            }
+
+            r = amdgpu_bo_va_op_raw(bo->ws->dev, backing->bo->bo,
+                                    (uint64_t)backing_start * RADEON_SPARSE_PAGE_SIZE,
+                                    (uint64_t)backing_size * RADEON_SPARSE_PAGE_SIZE,
+                                    bo->va + (uint64_t)span_va_page * RADEON_SPARSE_PAGE_SIZE,
+                                    AMDGPU_VM_PAGE_READABLE |
+                                    AMDGPU_VM_PAGE_WRITEABLE |
+                                    AMDGPU_VM_PAGE_EXECUTABLE,
+                                    AMDGPU_VA_OP_REPLACE);
+            if (r) {
+               ok = sparse_backing_free(bo, backing, backing_start, backing_size);
+               assert(ok && "sufficient memory should already be allocated");
+
+               ok = false;
+               goto out;
+            }
+
+            while (backing_size) {
+               comm[span_va_page].backing = backing;
+               comm[span_va_page].page = backing_start;
+               span_va_page++;
+               backing_start++;
+               backing_size--;
+            }
+         }
+      }
+   } else {
+      r = amdgpu_bo_va_op_raw(bo->ws->dev, NULL, 0,
+                              (uint64_t)(end_va_page - va_page) * RADEON_SPARSE_PAGE_SIZE,
+                              bo->va + (uint64_t)va_page * RADEON_SPARSE_PAGE_SIZE,
+                              AMDGPU_VM_PAGE_PRT, AMDGPU_VA_OP_REPLACE);
+      if (r) {
+         ok = false;
+         goto out;
+      }
+
+      while (va_page < end_va_page) {
+         struct amdgpu_sparse_backing *backing;
+         uint32_t backing_start;
+         uint32_t span_pages;
+
+         /* Skip pages that are already uncommitted. */
+         if (!comm[va_page].backing) {
+            va_page++;
+            continue;
+         }
+
+         /* Group contiguous spans of pages. */
+         backing = comm[va_page].backing;
+         backing_start = comm[va_page].page;
+         comm[va_page].backing = NULL;
+
+         span_pages = 1;
+         va_page++;
+
+         while (va_page < end_va_page &&
+                comm[va_page].backing == backing &&
+                comm[va_page].page == backing_start + span_pages) {
+            comm[va_page].backing = NULL;
+            va_page++;
+            span_pages++;
+         }
+
+         if (!sparse_backing_free(bo, backing, backing_start, span_pages)) {
+            /* Couldn't allocate tracking data structures, so we have to leak */
+            fprintf(stderr, "amdgpu: leaking PRT backing memory\n");
+            ok = false;
+         }
+      }
+   }
+out:
+
+   mtx_unlock(&bo->u.sparse.commit_lock);
+
+   return ok;
+}
+
 static unsigned eg_tile_split(unsigned tile_split)
 {
    switch (tile_split) {
@@ -696,7 +1085,7 @@ amdgpu_bo_create(struct radeon_winsys *rws,
    unsigned usage = 0, pb_cache_bucket;
 
    /* Sub-allocate small buffers from slabs. */
-   if (!(flags & RADEON_FLAG_HANDLE) &&
+   if (!(flags & (RADEON_FLAG_HANDLE | RADEON_FLAG_SPARSE)) &&
        size <= (1 << AMDGPU_SLAB_MAX_SIZE_LOG2) &&
        alignment <= MAX2(1 << AMDGPU_SLAB_MIN_SIZE_LOG2, util_next_power_of_two(size))) {
       struct pb_slab_entry *entry;
@@ -742,6 +1131,15 @@ amdgpu_bo_create(struct radeon_winsys *rws,
    }
 no_slab:
 
+   if (flags & RADEON_FLAG_SPARSE) {
+      assert(RADEON_SPARSE_PAGE_SIZE % alignment == 0);
+      assert(!(flags & RADEON_FLAG_CPU_ACCESS));
+
+      flags |= RADEON_FLAG_NO_CPU_ACCESS;
+
+      return amdgpu_bo_sparse_create(ws, size, domain, flags);
+   }
+
    /* This flag is irrelevant for the cache. */
    flags &= ~RADEON_FLAG_HANDLE;
 
@@ -1003,6 +1401,7 @@ void amdgpu_bo_init_functions(struct amdgpu_winsys *ws)
    ws->base.buffer_from_ptr = amdgpu_bo_from_ptr;
    ws->base.buffer_is_user_ptr = amdgpu_bo_is_user_ptr;
    ws->base.buffer_get_handle = amdgpu_bo_get_handle;
+   ws->base.buffer_commit = amdgpu_bo_sparse_commit;
    ws->base.buffer_get_virtual_address = amdgpu_bo_get_va;
    ws->base.buffer_get_initial_domain = amdgpu_bo_get_initial_domain;
 }
-- 
2.30.2