From e4aa8338c30de2f99de86bcfb1a3a39a21cea5e9 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Tue, 3 Apr 2018 01:40:23 -0700
Subject: [PATCH] iris: Soft-pin the universe

Breaks everything, woo!
---
 src/gallium/drivers/iris/iris_batch.c    |   4 +-
 src/gallium/drivers/iris/iris_bufmgr.c   | 237 +++++++++++++++++++++--
 src/gallium/drivers/iris/iris_bufmgr.h   |  43 ++--
 src/gallium/drivers/iris/iris_resource.c |  17 +-
 src/gallium/drivers/iris/iris_screen.c   |   3 -
 src/gallium/drivers/iris/iris_screen.h   |   2 -
 6 files changed, 251 insertions(+), 55 deletions(-)

diff --git a/src/gallium/drivers/iris/iris_batch.c b/src/gallium/drivers/iris/iris_batch.c
index 7a5c3df3a2e..72ede9f53da 100644
--- a/src/gallium/drivers/iris/iris_batch.c
+++ b/src/gallium/drivers/iris/iris_batch.c
@@ -95,7 +95,7 @@ create_batch_buffer(struct iris_bufmgr *bufmgr,
                     struct iris_batch_buffer *buf,
                     const char *name, unsigned size)
 {
-   buf->bo = iris_bo_alloc(bufmgr, name, size);
+   buf->bo = iris_bo_alloc(bufmgr, name, size, IRIS_MEMZONE_OTHER);
    buf->bo->kflags |= EXEC_OBJECT_CAPTURE;
    buf->map = iris_bo_map(NULL, buf->bo, MAP_READ | MAP_WRITE);
    buf->map_next = buf->map;
@@ -299,7 +299,7 @@ grow_buffer(struct iris_batch *batch,
    const unsigned existing_bytes = buffer_bytes_used(buf);
 
    struct iris_bo *new_bo =
-      iris_bo_alloc(bufmgr, bo->name, new_size);
+      iris_bo_alloc(bufmgr, bo->name, new_size, IRIS_MEMZONE_OTHER);
 
    buf->map = iris_bo_map(NULL, new_bo, MAP_READ | MAP_WRITE);
    buf->map_next = buf->map + existing_bytes;
diff --git a/src/gallium/drivers/iris/iris_bufmgr.c b/src/gallium/drivers/iris/iris_bufmgr.c
index 9bc101ee5ab..80e620b19e2 100644
--- a/src/gallium/drivers/iris/iris_bufmgr.c
+++ b/src/gallium/drivers/iris/iris_bufmgr.c
@@ -51,6 +51,8 @@
 #include "util/macros.h"
 #include "util/hash_table.h"
 #include "util/list.h"
+#include "util/u_dynarray.h"
+#include "util/vma.h"
 #include "iris_bufmgr.h"
 #include "iris_context.h"
 #include "string.h"
@@ -93,8 +95,6 @@ drm_ioctl(int fd, unsigned long request, void *arg)
    return ret;
 }
 
-
-
 static inline int
 atomic_add_unless(int *v, int add, int unless)
 {
@@ -105,9 +105,37 @@ atomic_add_unless(int *v, int add, int unless)
    return c == unless;
 }
 
+/*
+ * Idea:
+ *
+ * Have a bitmap-allocator for each BO cache bucket size.  Because bo_alloc
+ * rounds up allocations to the bucket size anyway, we can make 1 bit in
+ * the bitmap represent N pages of memory, where N = bucket size / PAGE_SIZE.
+ * Allocations and frees always set/unset a single bit.  Because ffsll only
+ * works on uint64_t, use a tree(?) of those.
+ *
+ * Nodes contain a starting address and a uint64_t bitmap.  (pair-of-uint64_t)
+ * Bitmap uses 1 for a free block, 0 for in-use.
+ *
+ * Bucket contains...
+ *
+ * Dynamic array of nodes.  (pointer, two ints)
+ */
+
+struct vma_bucket_node {
+   uint64_t start_address;
+   uint64_t bitmap;
+};
+
 struct bo_cache_bucket {
+   /** List of cached BOs. */
    struct list_head head;
+
+   /** Size of this bucket, in bytes. */
    uint64_t size;
+
+   /** List of vma_bucket_nodes */
+   struct util_dynarray vma_list[IRIS_MEMZONE_COUNT];
 };
 
 struct iris_bufmgr {
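An aside on the "Idea" comment in the hunk above: one bitmap node is easy to
see in miniature. The following is a minimal, self-contained sketch, not code
from this patch; block_size is a stand-in for bucket->size, and ffsll() is
assumed to be available as on glibc.

#include <assert.h>
#include <stdint.h>
#include <string.h>   /* ffsll() lives here on glibc; <strings.h> elsewhere */

/* One 64-block node: bit i covers
 *   [start_address + i * block_size, start_address + (i + 1) * block_size)
 * A set bit means the block is free; a clear bit means it is in use. */
struct node_sketch {
   uint64_t start_address;
   uint64_t bitmap;
};

static uint64_t
sketch_take_block(struct node_sketch *n, uint64_t block_size)
{
   int bit = ffsll(n->bitmap) - 1;   /* index of any free block */
   assert(bit != -1);                /* caller ensures the node isn't full */
   n->bitmap &= ~(1ull << bit);      /* clear = mark in-use */
   return n->start_address + (uint64_t)bit * block_size;
}

static void
sketch_return_block(struct node_sketch *n, uint64_t addr, uint64_t block_size)
{
   int bit = (addr - n->start_address) / block_size;
   n->bitmap |= 1ull << bit;         /* set = mark free again */
}

bucket_vma_alloc()/bucket_vma_free() below are essentially this, plus a
dynamic array of nodes per (bucket, memory zone) and the bookkeeping to
create and retire nodes.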
@@ -123,6 +151,8 @@ struct iris_bufmgr {
    struct hash_table *name_table;
    struct hash_table *handle_table;
 
+   struct util_vma_heap vma_allocator[IRIS_MEMZONE_COUNT];
+
    bool has_llc:1;
    bool bo_reuse:1;
 };
@@ -132,6 +162,10 @@ static int bo_set_tiling_internal(struct iris_bo *bo, uint32_t tiling_mode,
                                   uint32_t stride);
 
 static void bo_free(struct iris_bo *bo);
 
+static uint64_t vma_alloc(struct iris_bufmgr *bufmgr,
+                          enum iris_memory_zone memzone,
+                          uint64_t size, uint64_t alignment);
+
 static uint32_t
 key_hash_uint(const void *key)
 {
@@ -191,6 +225,141 @@ bucket_for_size(struct iris_bufmgr *bufmgr, uint64_t size)
           &bufmgr->cache_bucket[index] : NULL;
 }
 
+static enum iris_memory_zone
+memzone_for_address(uint64_t address)
+{
+   const uint64_t _4GB = 1ull << 32;
+
+   if (address >= 3 * _4GB)
+      return IRIS_MEMZONE_OTHER;
+
+   if (address >= 2 * _4GB)
+      return IRIS_MEMZONE_DYNAMIC;
+
+   if (address >= 1 * _4GB)
+      return IRIS_MEMZONE_SURFACE;
+
+   return IRIS_MEMZONE_SHADER;
+}
+
+static uint64_t
+bucket_vma_alloc(struct iris_bufmgr *bufmgr,
+                 struct bo_cache_bucket *bucket,
+                 enum iris_memory_zone memzone)
+{
+   struct util_dynarray *vma_list = &bucket->vma_list[memzone];
+   struct vma_bucket_node *node;
+
+   if (vma_list->size == 0) {
+      /* This bucket allocator is out of space - allocate a new block of
+       * memory from a larger allocator (either another bucket or util_vma),
+       * aligned to the node size so the free path can recover the node's
+       * start address by rounding down.  Set the first bit used, and
+       * return the start address.
+       */
+      node = util_dynarray_grow(vma_list, sizeof(struct vma_bucket_node));
+      node->start_address =
+         vma_alloc(bufmgr, memzone, 64ull * bucket->size,
+                   64ull * bucket->size);
+      node->bitmap = ~1ull;
+      return node->start_address;
+   }
+
+   /* Pick any bit from any node - they're all the right size and free. */
+   node = util_dynarray_top_ptr(vma_list, struct vma_bucket_node);
+   int bit = ffsll(node->bitmap) - 1;
+   assert(bit != -1);
+
+   /* Reserve the memory by clearing the bit. */
+   node->bitmap &= ~(1ull << bit);
+
+   /* If this node is now completely full, remove it from the free list. */
+   if (node->bitmap == 0ull) {
+      (void) util_dynarray_pop(vma_list, struct vma_bucket_node);
+   }
+
+   return node->start_address + bit * bucket->size;
+}
+
+static void
+bucket_vma_free(struct bo_cache_bucket *bucket,
+                uint64_t address,
+                uint64_t size)
+{
+   enum iris_memory_zone memzone = memzone_for_address(address);
+   struct util_dynarray *vma_list = &bucket->vma_list[memzone];
+   const uint64_t node_bytes = 64ull * bucket->size;
+   struct vma_bucket_node *node = NULL;
+
+   /* Nodes are aligned to node_bytes, so rounding down recovers the
+    * start address of the node containing this block.
+    */
+   uint64_t start = (address / node_bytes) * node_bytes;
+   int bit = (address - start) / bucket->size;
+
+   util_dynarray_foreach(vma_list, struct vma_bucket_node, cur) {
+      if (cur->start_address == start) {
+         node = cur;
+         break;
+      }
+   }
+
+   if (!node) {
+      node = util_dynarray_grow(vma_list, sizeof(struct vma_bucket_node));
+      node->start_address = start;
+      node->bitmap = 0ull;
+   }
+
+   node->bitmap |= 1ull << bit;
+
+   /* The block might be entirely free now, and if so, we could return it
+    * to the larger allocator.  But we may as well hang on to it, in case
+    * we get more allocations at this block size.
+    */
+}
+
+static struct bo_cache_bucket *
+get_bucket_allocator(struct iris_bufmgr *bufmgr, uint64_t size)
+{
+   /* Skip using the bucket allocator for very large sizes, as it grabs
+    * 64 blocks of that size at once, and this can balloon rather quickly.
+    */
+   if (size > 1024 * PAGE_SIZE)
+      return NULL;
+
+   struct bo_cache_bucket *bucket = bucket_for_size(bufmgr, size);
+
+   if (bucket && bucket->size == size)
+      return bucket;
+
+   return NULL;
+}
+
+static uint64_t
+vma_alloc(struct iris_bufmgr *bufmgr,
+          enum iris_memory_zone memzone,
+          uint64_t size,
+          uint64_t alignment)
+{
+   struct bo_cache_bucket *bucket = get_bucket_allocator(bufmgr, size);
+
+   if (bucket)
+      return bucket_vma_alloc(bufmgr, bucket, memzone);
+
+   return util_vma_heap_alloc(&bufmgr->vma_allocator[memzone], size,
+                              alignment);
+}
+
+static void
+vma_free(struct iris_bufmgr *bufmgr,
+         uint64_t address,
+         uint64_t size)
+{
+   struct bo_cache_bucket *bucket = get_bucket_allocator(bufmgr, size);
+
+   if (bucket) {
+      bucket_vma_free(bucket, address, size);
+   } else {
+      enum iris_memory_zone memzone = memzone_for_address(address);
+      util_vma_heap_free(&bufmgr->vma_allocator[memzone], address, size);
+   }
+}
+
 int
 iris_bo_busy(struct iris_bo *bo)
 {
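To make the dispatch concrete, here is a hypothetical caller, given some
struct iris_bufmgr *bufmgr (sizes assume PAGE_SIZE is 4096; this is an
illustration, not code from the patch):

/* Three pages match the 12288-byte cache bucket exactly, so the address
 * is carved out of a 64-block bitmap node spanning 64 * 12288 = 786432
 * bytes. */
uint64_t small = vma_alloc(bufmgr, IRIS_MEMZONE_OTHER, 3 * PAGE_SIZE, 1);

/* Eight megabytes exceed the 1024-page cutoff in get_bucket_allocator(),
 * so the request falls through to util_vma_heap_alloc(). */
uint64_t large = vma_alloc(bufmgr, IRIS_MEMZONE_OTHER, 8 * 1024 * 1024,
                           PAGE_SIZE);

/* Frees retrace the same decision from (address, size) alone: for 'small',
 * bucket_vma_free() rounds the address down to a multiple of 786432 to
 * find its node, which is why nodes are allocated with node-size
 * alignment above. */
vma_free(bufmgr, small, 3 * PAGE_SIZE);
vma_free(bufmgr, large, 8 * 1024 * 1024);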
@@ -237,6 +406,7 @@ static struct iris_bo *
 bo_alloc_internal(struct iris_bufmgr *bufmgr,
                   const char *name,
                   uint64_t size,
+                  enum iris_memory_zone memzone,
                   unsigned flags,
                   uint32_t tiling_mode,
                   uint32_t stride)
@@ -303,7 +473,15 @@ retry:
       }
    }
 
-   if (!alloc_from_cache) {
+   if (alloc_from_cache) {
+      /* If the cached BO isn't in the right memory zone, free the old
+       * memory and assign it a new address.
+       */
+      if (memzone != memzone_for_address(bo->gtt_offset)) {
+         vma_free(bufmgr, bo->gtt_offset, bo->size);
+         bo->gtt_offset = 0ull;
+      }
+   } else {
       bo = calloc(1, sizeof(*bo));
       if (!bo)
          goto err;
@@ -325,6 +503,7 @@ retry:
 
       bo->gem_handle = create.handle;
       bo->bufmgr = bufmgr;
+      bo->kflags = EXEC_OBJECT_PINNED;
 
       bo->tiling_mode = I915_TILING_NONE;
       bo->swizzle_mode = I915_BIT_6_SWIZZLE_NONE;
@@ -347,6 +526,13 @@ retry:
          goto err_free;
    }
 
+   if (bo->gtt_offset == 0ull) {
+      bo->gtt_offset = vma_alloc(bufmgr, memzone, bo->size, 1);
+
+      if (bo->gtt_offset == 0ull)
+         goto err_free;
+   }
+
    bo->name = name;
    p_atomic_set(&bo->refcount, 1);
    bo->reusable = true;
@@ -370,17 +556,20 @@ err:
 
 struct iris_bo *
 iris_bo_alloc(struct iris_bufmgr *bufmgr,
               const char *name,
-              uint64_t size)
+              uint64_t size,
+              enum iris_memory_zone memzone)
 {
-   return bo_alloc_internal(bufmgr, name, size, 0, I915_TILING_NONE, 0);
+   return bo_alloc_internal(bufmgr, name, size, memzone,
+                            0, I915_TILING_NONE, 0);
 }
 
 struct iris_bo *
 iris_bo_alloc_tiled(struct iris_bufmgr *bufmgr, const char *name,
-                    uint64_t size, uint32_t tiling_mode, uint32_t pitch,
-                    unsigned flags)
+                    uint64_t size, enum iris_memory_zone memzone,
+                    uint32_t tiling_mode, uint32_t pitch, unsigned flags)
 {
-   return bo_alloc_internal(bufmgr, name, size, flags, tiling_mode, pitch);
+   return bo_alloc_internal(bufmgr, name, size, memzone,
+                            flags, tiling_mode, pitch);
 }
 
 /**
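A caller's-eye view of the new parameter (hypothetical names and sizes): the
zone is chosen once, at allocation time, and address assignment, reuse, and
freeing all key off it.

/* Shader kernels go in the first 4 GB, the usual rationale being that
 * kernel start pointers are programmed as 32-bit offsets from the
 * instruction base address in STATE_BASE_ADDRESS. */
struct iris_bo *kernels =
   iris_bo_alloc(bufmgr, "shader kernels", 64 * 1024, IRIS_MEMZONE_SHADER);

/* Buffers with no special addressing needs take the catch-all zone. */
struct iris_bo *scratch =
   iris_bo_alloc(bufmgr, "scratch", 16 * 4096, IRIS_MEMZONE_OTHER);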
@@ -435,11 +624,13 @@ iris_bo_gem_create_from_name(struct iris_bufmgr *bufmgr,
    bo->size = open_arg.size;
    bo->gtt_offset = 0;
    bo->bufmgr = bufmgr;
+   bo->kflags = EXEC_OBJECT_PINNED;
 
    bo->gem_handle = open_arg.handle;
    bo->name = name;
    bo->global_name = handle;
    bo->reusable = false;
    bo->external = true;
+   bo->gtt_offset = vma_alloc(bufmgr, IRIS_MEMZONE_OTHER, bo->size, 1);
 
    _mesa_hash_table_insert(bufmgr->handle_table, &bo->gem_handle, bo);
    _mesa_hash_table_insert(bufmgr->name_table, &bo->global_name, bo);
@@ -494,6 +685,8 @@ bo_free(struct iris_bo *bo)
       _mesa_hash_table_remove(bufmgr->handle_table, entry);
    }
 
+   vma_free(bo->bufmgr, bo->gtt_offset, bo->size);
+
    /* Close this object */
    struct drm_gem_close close = { .handle = bo->gem_handle };
    int ret = drm_ioctl(bufmgr->fd, DRM_IOCTL_GEM_CLOSE, &close);
@@ -542,9 +735,7 @@ bo_unreference_final(struct iris_bo *bo, time_t time)
    if (bufmgr->bo_reuse && bo->reusable && bucket != NULL &&
        iris_bo_madvise(bo, I915_MADV_DONTNEED)) {
       bo->free_time = time;
 
-      bo->name = NULL;
-      bo->kflags = 0;
 
       list_addtail(&bo->head, &bucket->head);
    } else {
@@ -960,6 +1151,9 @@ iris_bufmgr_destroy(struct iris_bufmgr *bufmgr)
          bo_free(bo);
       }
+
+      for (int z = 0; z < IRIS_MEMZONE_COUNT; z++)
+         util_dynarray_fini(&bucket->vma_list[z]);
    }
 
    _mesa_hash_table_destroy(bufmgr->name_table, NULL);
@@ -1052,6 +1246,7 @@ iris_bo_import_dmabuf(struct iris_bufmgr *bufmgr, int prime_fd)
    bo->size = ret;
    bo->bufmgr = bufmgr;
+   bo->kflags = EXEC_OBJECT_PINNED;
 
    bo->gem_handle = handle;
    _mesa_hash_table_insert(bufmgr->handle_table, &bo->gem_handle, bo);
@@ -1059,6 +1254,7 @@ iris_bo_import_dmabuf(struct iris_bufmgr *bufmgr, int prime_fd)
    bo->name = "prime";
    bo->reusable = false;
    bo->external = true;
+   bo->gtt_offset = vma_alloc(bufmgr, IRIS_MEMZONE_OTHER, bo->size, 1);
 
    struct drm_i915_gem_get_tiling get_tiling = { .handle = bo->gem_handle };
    if (drm_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_GET_TILING, &get_tiling))
@@ -1164,6 +1360,8 @@ add_bucket(struct iris_bufmgr *bufmgr, int size)
    assert(i < ARRAY_SIZE(bufmgr->cache_bucket));
 
    list_inithead(&bufmgr->cache_bucket[i].head);
+   for (int z = 0; z < IRIS_MEMZONE_COUNT; z++)
+      util_dynarray_init(&bufmgr->cache_bucket[i].vma_list[z], NULL);
    bufmgr->cache_bucket[i].size = size;
    bufmgr->num_buckets++;
 }
@@ -1185,12 +1383,12 @@ init_cache_buckets(struct iris_bufmgr *bufmgr)
     * width/height alignment and rounding of sizes to pages will
     * get us useful cache hit rates anyway)
     */
-   add_bucket(bufmgr, 4096);
-   add_bucket(bufmgr, 4096 * 2);
-   add_bucket(bufmgr, 4096 * 3);
+   add_bucket(bufmgr, PAGE_SIZE);
+   add_bucket(bufmgr, PAGE_SIZE * 2);
+   add_bucket(bufmgr, PAGE_SIZE * 3);
 
    /* Initialize the linked lists for BO reuse cache. */
-   for (size = 4 * 4096; size <= cache_max_size; size *= 2) {
+   for (size = 4 * PAGE_SIZE; size <= cache_max_size; size *= 2) {
       add_bucket(bufmgr, size);
 
       add_bucket(bufmgr, size + size * 1 / 4);
@@ -1284,6 +1482,17 @@ iris_bufmgr_init(struct gen_device_info *devinfo, int fd)
 
    bufmgr->has_llc = devinfo->has_llc;
 
+   const uint64_t _4GB = 1ull << 32;
+
+   util_vma_heap_init(&bufmgr->vma_allocator[IRIS_MEMZONE_SHADER],
+                      PAGE_SIZE, _4GB - PAGE_SIZE);
+   util_vma_heap_init(&bufmgr->vma_allocator[IRIS_MEMZONE_SURFACE],
+                      1 * _4GB, _4GB);
+   util_vma_heap_init(&bufmgr->vma_allocator[IRIS_MEMZONE_DYNAMIC],
+                      2 * _4GB, _4GB);
+   util_vma_heap_init(&bufmgr->vma_allocator[IRIS_MEMZONE_OTHER],
+                      3 * _4GB, (1ull << 48) - 3 * _4GB);
+
    init_cache_buckets(bufmgr);
 
    bufmgr->name_table =
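Written out, the heap ranges above carve the 48-bit PPGTT like so (this is a
reading of the init code, with each zone a [start, start + size) range):

/*   IRIS_MEMZONE_SHADER  : [PAGE_SIZE, 4 GB)  - page 0 is left unused
 *   IRIS_MEMZONE_SURFACE : [4 GB,  8 GB)
 *   IRIS_MEMZONE_DYNAMIC : [8 GB, 12 GB)
 *   IRIS_MEMZONE_OTHER   : [12 GB, 1ull << 48)
 *
 * memzone_for_address() recovers the zone purely by comparing an address
 * against the same 4 GB boundaries, which is what lets vma_free() work
 * without being told which zone an address came from, e.g.: */
assert(memzone_for_address(0xf000) == IRIS_MEMZONE_SHADER);
assert(memzone_for_address(2ull << 32) == IRIS_MEMZONE_DYNAMIC);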
diff --git a/src/gallium/drivers/iris/iris_bufmgr.h b/src/gallium/drivers/iris/iris_bufmgr.h
index fa4df2a53df..3c52c2d8722 100644
--- a/src/gallium/drivers/iris/iris_bufmgr.h
+++ b/src/gallium/drivers/iris/iris_bufmgr.h
@@ -35,6 +35,15 @@
 struct gen_device_info;
 struct pipe_debug_callback;
 
+enum iris_memory_zone {
+   IRIS_MEMZONE_DYNAMIC,
+   IRIS_MEMZONE_SURFACE,
+   IRIS_MEMZONE_SHADER,
+   IRIS_MEMZONE_OTHER,
+};
+
+#define IRIS_MEMZONE_COUNT (IRIS_MEMZONE_OTHER + 1)
+
 struct iris_bo {
    /**
     * Size in bytes of the buffer object.
     *
@@ -51,31 +60,11 @@ struct iris_bo {
    uint32_t gem_handle;
 
    /**
-    * Offset of the buffer inside the Graphics Translation Table.
-    *
-    * This is effectively our GPU address for the buffer and we use it
-    * as our base for all state pointers into the buffer.  However, since the
-    * kernel may be forced to move it around during the course of the
-    * buffer's lifetime, we can only know where the buffer was on the last
-    * execbuf.  We presume, and are usually right, that the buffer will not
-    * move and so we use that last offset for the next batch and by doing
-    * so we can avoid having the kernel perform a relocation fixup pass as
-    * our pointers inside the batch will be using the correct base offset.
-    *
-    * Since we do use it as a base address for the next batch of pointers,
-    * the kernel treats our offset as a request, and if possible will
-    * arrange the buffer to placed at that address (trying to balance
-    * the cost of buffer migration versus the cost of performing
-    * relocations).  Furthermore, we can force the kernel to place the buffer,
-    * or report a failure if we specified a conflicting offset, at our chosen
-    * offset by specifying EXEC_OBJECT_PINNED.
-    *
-    * Note the GTT may be either per context, or shared globally across the
-    * system.  On a shared system, our buffers have to contend for address
-    * space with both aperture mappings and framebuffers and so are more
-    * likely to be moved.  On a full ppGTT system, each batch exists in its
-    * own GTT, and so each buffer may have their own offset within each
-    * context.
+    * Virtual address of the buffer inside the PPGTT (Per-Process Graphics
+    * Translation Table).
+    *
+    * Although each hardware context has its own VMA, we assign BOs to the
+    * same address in all contexts, for simplicity.
     */
    uint64_t gtt_offset;
@@ -156,7 +145,8 @@
  */
 struct iris_bo *iris_bo_alloc(struct iris_bufmgr *bufmgr,
                               const char *name,
-                              uint64_t size);
+                              uint64_t size,
+                              enum iris_memory_zone memzone);
 
 /**
  * Allocate a tiled buffer object.
@@ -174,7 +164,8 @@
 struct iris_bo *iris_bo_alloc_tiled(struct iris_bufmgr *bufmgr,
                                     const char *name,
                                     uint64_t size,
+                                    enum iris_memory_zone memzone,
                                     uint32_t tiling_mode,
                                     uint32_t pitch,
                                     unsigned flags);
 
 /** Takes a reference on a buffer object */
 static inline void
diff --git a/src/gallium/drivers/iris/iris_resource.c b/src/gallium/drivers/iris/iris_resource.c
index 93855741f84..92c9b038a3d 100644
--- a/src/gallium/drivers/iris/iris_resource.c
+++ b/src/gallium/drivers/iris/iris_resource.c
@@ -256,19 +256,20 @@ iris_resource_create_with_modifiers(struct pipe_screen *pscreen,
                  .usage = usage,
                  .tiling_flags = 1 << mod_info->tiling);
 
-   res->bo = iris_bo_alloc_tiled(screen->bufmgr, "resource", res->surf.size_B,
+   enum iris_memory_zone memzone = IRIS_MEMZONE_OTHER;
+   const char *name = "resource";
+   if (templ->flags & IRIS_RESOURCE_FLAG_INSTRUCTION_CACHE) {
+      memzone = IRIS_MEMZONE_SHADER;
+      name = "shader kernels";
+   }
+
+   res->bo = iris_bo_alloc_tiled(screen->bufmgr, name, res->surf.size_B,
+                                 memzone,
                                  isl_tiling_to_i915_tiling(res->surf.tiling),
                                  res->surf.row_pitch_B, 0);
    if (!res->bo)
       goto fail;
 
-   if (templ->flags & IRIS_RESOURCE_FLAG_INSTRUCTION_CACHE) {
-      res->bo->kflags = EXEC_OBJECT_PINNED;
-      res->bo->name = "instruction cache";
-      // XXX: p_atomic_add is backwards :(
-      res->bo->gtt_offset = __atomic_fetch_add(&screen->next_instruction_address, res->bo->size, __ATOMIC_ACQ_REL);
-   }
-
    return &res->base;
 
 fail:
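From the caller's side, shader placement is now a side effect of ordinary
resource creation rather than the hand-rolled pinning deleted above. A
hypothetical usage (IRIS_RESOURCE_FLAG_INSTRUCTION_CACHE is driver-internal,
and the field values here are invented):

struct pipe_resource templ = {
   .target = PIPE_BUFFER,
   .format = PIPE_FORMAT_R8_UNORM,
   .width0 = 64 * 1024,
   .height0 = 1,
   .depth0 = 1,
   .array_size = 1,
   .flags = IRIS_RESOURCE_FLAG_INSTRUCTION_CACHE,
};
struct pipe_resource *pres = pscreen->resource_create(pscreen, &templ);
/* ((struct iris_resource *)pres)->bo->gtt_offset should now fall below
 * 4 GB, courtesy of IRIS_MEMZONE_SHADER. */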
diff --git a/src/gallium/drivers/iris/iris_screen.c b/src/gallium/drivers/iris/iris_screen.c
index d6d125bb95b..79dba4c507c 100644
--- a/src/gallium/drivers/iris/iris_screen.c
+++ b/src/gallium/drivers/iris/iris_screen.c
@@ -526,8 +526,5 @@ iris_screen_create(int fd)
    pscreen->fence_finish = iris_fence_finish;
    pscreen->query_memory_info = iris_query_memory_info;
 
-   /* Put them somewhere non-zero */
-   screen->next_instruction_address = 128 * 1024;
-
    return pscreen;
 }
diff --git a/src/gallium/drivers/iris/iris_screen.h b/src/gallium/drivers/iris/iris_screen.h
index f24f567532f..5484e535ac9 100644
--- a/src/gallium/drivers/iris/iris_screen.h
+++ b/src/gallium/drivers/iris/iris_screen.h
@@ -47,8 +47,6 @@ struct iris_screen {
    struct isl_device isl_dev;
    struct iris_bufmgr *bufmgr;
    struct brw_compiler *compiler;
-
-   uint32_t next_instruction_address;
 };
 
 struct pipe_screen *iris_screen_create(int fd);
-- 
2.30.2
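A closing note on EXEC_OBJECT_PINNED (a sketch of the i915 uAPI side, not
code from this patch): since every BO now carries the flag in kflags, the
validation-list entry a batch submits can simply echo the address that
vma_alloc() picked, along the lines of

struct drm_i915_gem_exec_object2 validation_entry = {
   .handle = bo->gem_handle,
   .offset = bo->gtt_offset,  /* soft-pinned address chosen by vma_alloc() */
   .flags  = bo->kflags,      /* EXEC_OBJECT_PINNED, plus EXEC_OBJECT_CAPTURE
                               * on batch buffers */
};

and the kernel binds the object at exactly that offset, or fails the execbuf
on a genuine conflict, instead of treating the offset as a movable hint and
rewriting relocations. One caveat: pinning at or above 4 GB (everything
outside IRIS_MEMZONE_SHADER here) additionally requires
EXEC_OBJECT_SUPPORTS_48B_ADDRESS on the entry; setting that flag is not
visible in this diff.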