panfrost: Implement index buffer cache
author Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Wed, 19 Feb 2020 15:32:20 +0000 (10:32 -0500)
committer Marge Bot <eric+marge@anholt.net>
Thu, 27 Feb 2020 10:30:48 +0000 (10:30 +0000)
For index buffer resources (not user index buffers), we're able to cache
results. In practice, the cache works pretty dang well. It's still
important that the min/max computation is efficient (since when the
cache misses it'll run at draw-time and we don't want jank), but this
can eliminate a lot of computations entirely.

We use a custom data structure for caching. Search is O(N) in the cache
size, but the size is capped, so it's effectively O(1). Insertion is
O(1) with automatic eviction of the oldest entry, on the assumption that
the oldest results are the least likely to still be useful. We might
also experiment with other heuristics based on actual usage later.
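
As a rough standalone sketch (the panfrost_minmax_cache layout matches
what this patch adds below, but the minmax_cache_get/minmax_cache_add
helpers are hypothetical names, not the driver's), lookup and insertion
behave like this:

    #include <stdbool.h>
    #include <stdint.h>

    #define PANFROST_MINMAX_SIZE 64

    struct panfrost_minmax_cache {
            uint64_t keys[PANFROST_MINMAX_SIZE];   /* (count << 32) | start */
            uint64_t values[PANFROST_MINMAX_SIZE]; /* (max << 32) | min */
            unsigned size;
            unsigned index;
    };

    /* O(size) linear search, with size capped at PANFROST_MINMAX_SIZE */
    static bool
    minmax_cache_get(struct panfrost_minmax_cache *cache,
                     uint32_t start, uint32_t count,
                     uint32_t *min, uint32_t *max)
    {
            uint64_t key = ((uint64_t) count << 32) | start;

            for (unsigned i = 0; i < cache->size; ++i) {
                    if (cache->keys[i] == key) {
                            *min = cache->values[i] & 0xffffffff;
                            *max = cache->values[i] >> 32;
                            return true;
                    }
            }

            return false;
    }

    /* O(1) insertion; once full, overwrite the oldest entry in ring order */
    static void
    minmax_cache_add(struct panfrost_minmax_cache *cache,
                     uint32_t start, uint32_t count,
                     uint32_t min, uint32_t max)
    {
            unsigned i;

            if (cache->size == PANFROST_MINMAX_SIZE) {
                    i = cache->index;
                    cache->index = (cache->index + 1) % PANFROST_MINMAX_SIZE;
            } else {
                    i = cache->size++;
            }

            cache->keys[i] = ((uint64_t) count << 32) | start;
            cache->values[i] = min | ((uint64_t) max << 32);
    }

    int
    main(void)
    {
            struct panfrost_minmax_cache c = { 0 };
            uint32_t min, max;

            minmax_cache_add(&c, /* start */ 0, /* count */ 120, 5, 3000);
            return minmax_cache_get(&c, 0, 120, &min, &max) ? 0 : 1;
    }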

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com>
Tested-by: Marge Bot <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3880>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3880>

src/gallium/drivers/panfrost/pan_context.c
src/gallium/drivers/panfrost/pan_resource.c
src/gallium/drivers/panfrost/pan_resource.h

index 0381e4104373cc655f522f063eec43bf7a1dafea..598009ba1c078a9c4a90b76e9ef93759f3688e1c 100644 (file)
@@ -1274,6 +1274,8 @@ panfrost_get_index_buffer_bounded(struct panfrost_context *ctx, const struct pip
                 needs_indices = false;
         }
 
+        uint64_t ht_key = 0;
+
         if (!info->has_user_indices) {
                 /* Only resources can be directly mapped */
                 panfrost_batch_add_bo(batch, rsrc->bo,
@@ -1281,6 +1283,24 @@ panfrost_get_index_buffer_bounded(struct panfrost_context *ctx, const struct pip
                                       PAN_BO_ACCESS_READ |
                                       PAN_BO_ACCESS_VERTEX_TILER);
                 out = rsrc->bo->gpu + offset;
+
+                /* Check the cache */
+                if (rsrc->index_cache) {
+                        ht_key = (((uint64_t) info->count) << 32) | info->start;
+
+                        struct panfrost_minmax_cache *cache = rsrc->index_cache;
+
+                        for (unsigned i = 0; i < cache->size; ++i) {
+                                if (cache->keys[i] == ht_key) {
+                                        uint64_t hit = cache->values[i];
+
+                                        *min_index = hit & 0xffffffff;
+                                        *max_index = hit >> 32;
+                                        needs_indices = false;
+                                        break;
+                                }
+                        }
+                }
         } else {
                 /* Otherwise, we need to upload to transient memory */
                 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
@@ -1290,8 +1310,25 @@ panfrost_get_index_buffer_bounded(struct panfrost_context *ctx, const struct pip
         if (needs_indices) {
                 /* Fallback */
                 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
+
+                if (!info->has_user_indices && rsrc->index_cache) {
+                        struct panfrost_minmax_cache *cache = rsrc->index_cache;
+                        uint64_t value = (*min_index) | (((uint64_t) *max_index) << 32);
+                        unsigned index = 0;
+
+                        if (cache->size == PANFROST_MINMAX_SIZE) {
+                                index = cache->index++;
+                                cache->index = cache->index % PANFROST_MINMAX_SIZE;
+                        } else {
+                                index = cache->size++;
+                        }
+
+                        cache->keys[index] = ht_key;
+                        cache->values[index] = value;
+                }
         }
 
+
         return out;
 }
 
index d4c134f6246166c5c1abb7c4ab98a58549459692..b9d3cf31e3a2d163df8a5b9a1565a7c76d4c9a28 100644 (file)
@@ -513,6 +513,9 @@ panfrost_resource_create(struct pipe_screen *screen,
         panfrost_resource_create_bo(pscreen, so);
         panfrost_resource_reset_damage(so);
 
+        if (template->bind & PIPE_BIND_INDEX_BUFFER)
+                so->index_cache = rzalloc(so, struct panfrost_minmax_cache);
+
         return (struct pipe_resource *)so;
 }
 
@@ -533,6 +536,40 @@ panfrost_resource_destroy(struct pipe_screen *screen,
         ralloc_free(rsrc);
 }
 
+/* If we've been caching min/max indices and we update the index
+ * buffer, that may invalidate the min/max. Check what's been cached vs
+ * what we've written, and throw out invalid entries. */
+
+static void
+panfrost_invalidate_index_cache(struct panfrost_resource *rsrc, struct pipe_transfer *transfer)
+{
+        struct panfrost_minmax_cache *cache = rsrc->index_cache;
+
+        /* Ensure there is a cache to invalidate and a write */
+        if (!rsrc->index_cache) return;
+        if (!(transfer->usage & PIPE_TRANSFER_WRITE)) return;
+
+        unsigned valid_count = 0;
+
+        for (unsigned i = 0; i < cache->size; ++i) {
+                uint64_t key = cache->keys[i];
+
+                uint32_t start = key & 0xffffffff;
+                uint32_t count = key >> 32;
+
+                /* 1D range intersection */
+                bool invalid = MAX2(transfer->box.x, start) < MIN2(transfer->box.x + transfer->box.width, start + count);
+                if (!invalid) {
+                        cache->keys[valid_count] = key;
+                        cache->values[valid_count] = cache->values[i];
+                        valid_count++;
+                }
+        }
+
+        cache->size = valid_count;
+        cache->index = 0;
+}
+
 static void *
 panfrost_transfer_map(struct pipe_context *pctx,
                       struct pipe_resource *resource,
@@ -635,6 +672,15 @@ panfrost_transfer_map(struct pipe_context *pctx,
 
                 return transfer->map;
         } else {
+                /* A direct, persistent mapping allows writes at arbitrary
+                 * times the cache can't observe... unclear if this can
+                 * actually happen, but we should still get it right */
+
+                unsigned dpw = PIPE_TRANSFER_MAP_DIRECTLY | PIPE_TRANSFER_WRITE | PIPE_TRANSFER_PERSISTENT;
+
+                if ((usage & dpw) == dpw && rsrc->index_cache)
+                        return NULL;
+
                 transfer->base.stride = rsrc->slices[level].stride;
                 transfer->base.layer_stride = panfrost_get_layer_stride(
                                 rsrc->slices, rsrc->base.target == PIPE_TEXTURE_3D,
@@ -643,8 +689,10 @@ panfrost_transfer_map(struct pipe_context *pctx,
                 /* By mapping direct-write, we're implicitly already
                  * initialized (maybe), so be conservative */
 
-                if ((usage & PIPE_TRANSFER_WRITE) && (usage & PIPE_TRANSFER_MAP_DIRECTLY))
+                if ((usage & PIPE_TRANSFER_WRITE) && (usage & PIPE_TRANSFER_MAP_DIRECTLY)) {
                         rsrc->slices[level].initialized = true;
+                        panfrost_invalidate_index_cache(rsrc, &transfer->base);
+                }
 
                 return bo->cpu
                        + rsrc->slices[level].offset
@@ -693,6 +741,8 @@ panfrost_transfer_unmap(struct pipe_context *pctx,
                        transfer->box.x,
                        transfer->box.x + transfer->box.width);
 
+        panfrost_invalidate_index_cache(prsrc, transfer);
+
         /* Dereference the resource */
         pipe_resource_reference(&transfer->resource, NULL);
 
index 7173526023ff44a3864eaeef0bd0977e1c8ac3f6..2728c7f0aebb0f3b0a403c7bcaf0d69cb5b67fdf 100644 (file)
 #include "drm-uapi/drm.h"
 #include "util/u_range.h"
 
+/* Index buffer min/max cache. We need to calculate the min/max for arbitrary
+ * slices (start, start + count) of the index buffer at draw time. As this can
+ * be quite expensive, we cache. Conceptually, we just use a hash table mapping
+ * the key (start, count) to the value (min, max). In practice, Mesa's hash
+ * table implementation is higher overhead than we would like and makes
+ * handling memory usage a little complicated. So we use this data structure
+ * instead. Searching is O(n) in the size, but the size is capped at the
+ * PANFROST_MINMAX_SIZE constant (so this is a tradeoff between cache hit/miss
+ * ratio and cache search speed). Note that keys are adjacent so we get cache
+ * line alignment benefits. Insertion is O(1) and in-order until the cache
+ * fills up; after that it evicts the oldest cached value in a ring tracked
+ * by index.
+ */
+
+#define PANFROST_MINMAX_SIZE 64
+
+struct panfrost_minmax_cache {
+        uint64_t keys[PANFROST_MINMAX_SIZE];
+        uint64_t values[PANFROST_MINMAX_SIZE];
+        unsigned size;
+        unsigned index;
+};
+
 struct panfrost_resource {
         struct pipe_resource base;
         struct {
@@ -60,6 +83,9 @@ struct panfrost_resource {
         bool checksummed;
 
         enum pipe_format internal_format;
+
+        /* Cached min/max values for index buffers */
+        struct panfrost_minmax_cache *index_cache;
 };
 
 static inline struct panfrost_resource *
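
For reference, the invalidation check in panfrost_invalidate_index_cache
above is the standard half-open interval overlap test: [a0, a1) and
[b0, b1) intersect iff MAX2(a0, b0) < MIN2(a1, b1). A minimal
illustration (the ranges_overlap helper is a hypothetical name, not part
of the patch):

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    #define MAX2(a, b) ((a) > (b) ? (a) : (b))
    #define MIN2(a, b) ((a) < (b) ? (a) : (b))

    /* Half-open ranges [a0, a1) and [b0, b1) overlap iff
     * MAX2(a0, b0) < MIN2(a1, b1) */
    static bool
    ranges_overlap(uint32_t a0, uint32_t a1, uint32_t b0, uint32_t b1)
    {
            return MAX2(a0, b0) < MIN2(a1, b1);
    }

    int
    main(void)
    {
            /* A write covering [64, 128) invalidates a cached slice
             * keyed over [100, 300)... */
            assert(ranges_overlap(64, 128, 100, 300));

            /* ...but not one over [128, 192), since the ranges only abut */
            assert(!ranges_overlap(64, 128, 128, 192));

            return 0;
    }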