needs_indices = false;
}
+ uint64_t ht_key = 0;
+
if (!info->has_user_indices) {
/* Only resources can be directly mapped */
panfrost_batch_add_bo(batch, rsrc->bo,
PAN_BO_ACCESS_READ |
PAN_BO_ACCESS_VERTEX_TILER);
out = rsrc->bo->gpu + offset;
+
+ /* Check the cache */
+ if (rsrc->index_cache) {
+ ht_key = (((uint64_t) info->count) << 32) | info->start;
+
+ struct panfrost_minmax_cache *cache = rsrc->index_cache;
+
+ for (unsigned i = 0; i < cache->size; ++i) {
+ if (cache->keys[i] == ht_key) {
+ uint64_t hit = cache->values[i];
+
+ *min_index = hit & 0xffffffff;
+ *max_index = hit >> 32;
+ needs_indices = false;
+ break;
+ }
+ }
+ }
} else {
/* Otherwise, we need to upload to transient memory */
const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
if (needs_indices) {
/* Fallback */
u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
+
+ if (!info->has_user_indices && rsrc->index_cache) {
+ struct panfrost_minmax_cache *cache = rsrc->index_cache;
+ uint64_t value = (*min_index) | (((uint64_t) *max_index) << 32);
+ unsigned index = 0;
+
+ if (cache->size == PANFROST_MINMAX_SIZE) {
+ index = cache->index++;
+ cache->index = cache->index % PANFROST_MINMAX_SIZE;
+ } else {
+ index = cache->size++;
+ }
+
+ cache->keys[index] = ht_key;
+ cache->values[index] = value;
+ }
}
+
return out;
}
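
For reference, a minimal standalone sketch (not part of the patch; the pack_key/pack_value helpers are hypothetical) of the 32/32-bit packing used above: the draw range goes into the cache key with count in the high word and start in the low word, while the result goes into the value with max in the high word and min in the low word.

#include <stdint.h>
#include <stdio.h>

/* Illustrative only, not driver API: mirrors ht_key above,
 * count in the upper 32 bits, start in the lower 32 bits. */
static uint64_t
pack_key(uint32_t start, uint32_t count)
{
        return (((uint64_t) count) << 32) | start;
}

/* Illustrative only: mirrors the cached value, max in the upper
 * 32 bits, min in the lower 32 bits. */
static uint64_t
pack_value(uint32_t min_index, uint32_t max_index)
{
        return (((uint64_t) max_index) << 32) | min_index;
}

int
main(void)
{
        uint64_t key = pack_key(16, 300);   /* draw of 300 indices at offset 16 */
        uint64_t value = pack_value(2, 257);

        printf("start=%u count=%u min=%u max=%u\n",
               (unsigned) (key & 0xffffffff), (unsigned) (key >> 32),
               (unsigned) (value & 0xffffffff), (unsigned) (value >> 32));
        return 0;
}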
panfrost_resource_create_bo(pscreen, so);
panfrost_resource_reset_damage(so);
+ if (template->bind & PIPE_BIND_INDEX_BUFFER)
+ so->index_cache = rzalloc(so, struct panfrost_minmax_cache);
+
return (struct pipe_resource *)so;
}
ralloc_free(rsrc);
}
+/* If we've been caching min/max indices and we update the index
+ * buffer, that may invalidate the min/max. Check what's been cached vs
+ * what we've written, and throw out invalid entries. */
+
+static void
+panfrost_invalidate_index_cache(struct panfrost_resource *rsrc, struct pipe_transfer *transfer)
+{
+ struct panfrost_minmax_cache *cache = rsrc->index_cache;
+
+ /* Ensure there is a cache to invalidate and a write */
+ if (!rsrc->index_cache) return;
+ if (!(transfer->usage & PIPE_TRANSFER_WRITE)) return;
+
+ unsigned valid_count = 0;
+
+ for (unsigned i = 0; i < cache->size; ++i) {
+ uint64_t key = cache->keys[i];
+
+ uint32_t start = key & 0xffffffff;
+ uint32_t count = key >> 32;
+
+ /* 1D range intersection */
+ bool invalid = MAX2(transfer->box.x, start) < MIN2(transfer->box.x + transfer->box.width, start + count);
+ if (!invalid) {
+ cache->keys[valid_count] = key;
+ cache->values[valid_count] = cache->values[i];
+ valid_count++;
+ }
+ }
+
+ cache->size = valid_count;
+ cache->index = 0;
+}
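
As an aside, a minimal standalone sketch (not part of the patch; the ranges_intersect helper is hypothetical) of the 1D overlap test used for invalidation above: two half-open ranges intersect exactly when the larger start lies below the smaller end.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Half-open ranges [a_start, a_end) and [b_start, b_end) overlap
 * iff max(a_start, b_start) < min(a_end, b_end). */
static bool
ranges_intersect(uint32_t a_start, uint32_t a_end,
                 uint32_t b_start, uint32_t b_end)
{
        uint32_t lo = a_start > b_start ? a_start : b_start;
        uint32_t hi = a_end < b_end ? a_end : b_end;
        return lo < hi;
}

int
main(void)
{
        /* A write covering [10, 20) invalidates a cached entry with
         * (start=15, count=10), but not one with (start=20, count=5). */
        assert(ranges_intersect(10, 20, 15, 25));
        assert(!ranges_intersect(10, 20, 20, 25));
        return 0;
}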
+
static void *
panfrost_transfer_map(struct pipe_context *pctx,
struct pipe_resource *resource,
return transfer->map;
} else {
+ /* Direct, persistent mappings let the CPU write to the
+ * index buffer at arbitrary times without us noticing, so
+ * cached min/max values could silently go stale. It's not
+ * clear this combination can actually occur, but refuse the
+ * mapping to stay correct if it does. */
+
+ unsigned dpw = PIPE_TRANSFER_MAP_DIRECTLY | PIPE_TRANSFER_WRITE | PIPE_TRANSFER_PERSISTENT;
+
+ if ((usage & dpw) == dpw && rsrc->index_cache)
+ return NULL;
+
transfer->base.stride = rsrc->slices[level].stride;
transfer->base.layer_stride = panfrost_get_layer_stride(
rsrc->slices, rsrc->base.target == PIPE_TEXTURE_3D,
/* By mapping direct-write, we're implicitly already
* initialized (maybe), so be conservative */
- if ((usage & PIPE_TRANSFER_WRITE) && (usage & PIPE_TRANSFER_MAP_DIRECTLY))
+ if ((usage & PIPE_TRANSFER_WRITE) && (usage & PIPE_TRANSFER_MAP_DIRECTLY)) {
rsrc->slices[level].initialized = true;
+ panfrost_invalidate_index_cache(rsrc, &transfer->base);
+ }
return bo->cpu
+ rsrc->slices[level].offset
transfer->box.x,
transfer->box.x + transfer->box.width);
+ panfrost_invalidate_index_cache(prsrc, transfer);
+
/* Dereference the resource */
pipe_resource_reference(&transfer->resource, NULL);
#include "drm-uapi/drm.h"
#include "util/u_range.h"
+/* Index buffer min/max cache. We need to calculate the min/max for arbitrary
+ * slices (start, start + count) of the index buffer at draw time. As this can
+ * be quite expensive, we cache the results. Conceptually, we just use a hash
+ * table mapping the key (start, count) to the value (min, max). In practice,
+ * Mesa's hash table implementation has higher overhead than we would like and
+ * makes handling memory usage a little complicated. So we use this data
+ * structure instead. Searching is O(n) in the size, but the size is capped at
+ * the PANFROST_MINMAX_SIZE constant (so this is a tradeoff between cache
+ * hit/miss ratio and cache search speed). Note that keys are adjacent, so we
+ * get cache line alignment benefits. Insertion is O(1) and in-order until the
+ * cache fills up; after that, the oldest cached value is evicted in a ring
+ * tracked by index.
+ */
+
+#define PANFROST_MINMAX_SIZE 64
+
+struct panfrost_minmax_cache {
+ uint64_t keys[PANFROST_MINMAX_SIZE];
+ uint64_t values[PANFROST_MINMAX_SIZE];
+ unsigned size;
+ unsigned index;
+};
+
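
To make the comment above concrete, a minimal standalone sketch (not part of the patch; the sketch_* names and the tiny capacity are hypothetical) of the fill-then-ring-evict insertion and the linear lookup it describes:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SKETCH_MINMAX_SIZE 4 /* tiny capacity, for illustration only */

struct sketch_minmax_cache {
        uint64_t keys[SKETCH_MINMAX_SIZE];
        uint64_t values[SKETCH_MINMAX_SIZE];
        unsigned size;
        unsigned index;
};

/* Append while there is room; once full, overwrite the oldest slot,
 * advancing a wrapping write cursor (the "ring"). */
static void
sketch_insert(struct sketch_minmax_cache *cache, uint64_t key, uint64_t value)
{
        unsigned slot;

        if (cache->size == SKETCH_MINMAX_SIZE) {
                slot = cache->index++;
                cache->index %= SKETCH_MINMAX_SIZE;
        } else {
                slot = cache->size++;
        }

        cache->keys[slot] = key;
        cache->values[slot] = value;
}

/* Linear scan over at most SKETCH_MINMAX_SIZE keys. */
static bool
sketch_lookup(const struct sketch_minmax_cache *cache, uint64_t key,
              uint64_t *value)
{
        for (unsigned i = 0; i < cache->size; ++i) {
                if (cache->keys[i] == key) {
                        *value = cache->values[i];
                        return true;
                }
        }

        return false;
}

int
main(void)
{
        struct sketch_minmax_cache cache = { 0 };
        uint64_t value;

        /* Fill past capacity; key 0 is the oldest and gets evicted first. */
        for (uint64_t key = 0; key < SKETCH_MINMAX_SIZE + 1; ++key)
                sketch_insert(&cache, key, key * 100);

        printf("key 0 cached: %d\n", sketch_lookup(&cache, 0, &value)); /* 0 */
        printf("key 4 cached: %d\n", sketch_lookup(&cache, 4, &value)); /* 1 */
        return 0;
}

Keeping the keys in one plain array means a miss touches at most PANFROST_MINMAX_SIZE contiguous 64-bit words, which is the cache-line friendliness the comment alludes to.
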
struct panfrost_resource {
struct pipe_resource base;
struct {
bool checksummed;
enum pipe_format internal_format;
+
+ /* Cached min/max values for index buffers */
+ struct panfrost_minmax_cache *index_cache;
};
static inline struct panfrost_resource *