freedreno: add batch-cache and batch reordering
authorRob Clark <robdclark@gmail.com>
Mon, 27 Jun 2016 13:44:15 +0000 (09:44 -0400)
committerRob Clark <robdclark@gmail.com>
Sat, 30 Jul 2016 13:23:42 +0000 (09:23 -0400)
Note that I originally also had an entry-point that would construct a key
and do lookup from a pipe_surface.  I ended up not needing that (yet?)
but it is easy enough to re-introduce later if we need it for the blit
path.

For now, not enabled by default, but can be enabled (on a3xx/a4xx) with
FD_MESA_DEBUG=reorder.
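
For example (the application name below is just a placeholder):

    FD_MESA_DEBUG=reorder ./some_gl_app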

Signed-off-by: Rob Clark <robdclark@gmail.com>
15 files changed:
src/gallium/drivers/freedreno/Makefile.sources
src/gallium/drivers/freedreno/freedreno_batch.c
src/gallium/drivers/freedreno/freedreno_batch.h
src/gallium/drivers/freedreno/freedreno_batch_cache.c [new file with mode: 0644]
src/gallium/drivers/freedreno/freedreno_batch_cache.h [new file with mode: 0644]
src/gallium/drivers/freedreno/freedreno_context.c
src/gallium/drivers/freedreno/freedreno_context.h
src/gallium/drivers/freedreno/freedreno_draw.c
src/gallium/drivers/freedreno/freedreno_query_hw.c
src/gallium/drivers/freedreno/freedreno_resource.c
src/gallium/drivers/freedreno/freedreno_resource.h
src/gallium/drivers/freedreno/freedreno_screen.c
src/gallium/drivers/freedreno/freedreno_screen.h
src/gallium/drivers/freedreno/freedreno_state.c
src/gallium/drivers/freedreno/freedreno_util.h

index 4ba8c9dd19aebb8e992ad4625241a6131f6c2b22..92d9186597ced703b7bb23fd9a42a7a0792e2b03 100644 (file)
@@ -4,6 +4,8 @@ C_SOURCES := \
        disasm.h \
        freedreno_batch.c \
        freedreno_batch.h \
+       freedreno_batch_cache.c \
+       freedreno_batch_cache.h \
        freedreno_context.c \
        freedreno_context.h \
        freedreno_draw.c \
index 1fbce43f62cb34861161a78f933f4c9b244b574b..2dd7eda72adfd1f9a507e9a9bc35a5e665249372 100644 (file)
  */
 
 #include "util/list.h"
+#include "util/set.h"
+#include "util/hash_table.h"
 #include "util/u_string.h"
 
 #include "freedreno_batch.h"
 #include "freedreno_context.h"
 #include "freedreno_resource.h"
 
-struct fd_batch *
-fd_batch_create(struct fd_context *ctx)
+static void
+batch_init(struct fd_batch *batch)
 {
-       struct fd_batch *batch = CALLOC_STRUCT(fd_batch);
-       static unsigned seqno = 0;
+       struct fd_context *ctx = batch->ctx;
        unsigned size = 0;
 
-       if (!batch)
-               return NULL;
-
-       pipe_reference_init(&batch->reference, 1);
-       batch->seqno = ++seqno;
-       batch->ctx = ctx;
-
        /* if kernel is too old to support unlimited # of cmd buffers, we
         * have no option but to allocate large worst-case sizes so that
         * we don't need to grow the ringbuffer.  Performance is likely to
@@ -62,7 +56,11 @@ fd_batch_create(struct fd_context *ctx)
        fd_ringbuffer_set_parent(batch->draw, batch->gmem);
        fd_ringbuffer_set_parent(batch->binning, batch->gmem);
 
-       list_inithead(&batch->used_resources);
+       batch->cleared = batch->partial_cleared = 0;
+       batch->restore = batch->resolve = 0;
+       batch->needs_flush = false;
+       batch->gmem_reason = 0;
+       batch->num_draws = 0;
 
        /* reset maximal bounds: */
        batch->max_scissor.minx = batch->max_scissor.miny = ~0;
@@ -73,13 +71,33 @@ fd_batch_create(struct fd_context *ctx)
        if (is_a3xx(ctx->screen))
                util_dynarray_init(&batch->rbrc_patches);
 
+       assert(batch->resources->entries == 0);
+}
+
+struct fd_batch *
+fd_batch_create(struct fd_context *ctx)
+{
+       struct fd_batch *batch = CALLOC_STRUCT(fd_batch);
+
+       if (!batch)
+               return NULL;
+
+       DBG("%p", batch);
+
+       pipe_reference_init(&batch->reference, 1);
+       batch->ctx = ctx;
+
+       batch->resources = _mesa_set_create(NULL, _mesa_hash_pointer,
+                       _mesa_key_pointer_equal);
+
+       batch_init(batch);
+
        return batch;
 }
 
-void
-__fd_batch_destroy(struct fd_batch *batch)
+static void
+batch_fini(struct fd_batch *batch)
 {
-       util_copy_framebuffer_state(&batch->framebuffer, NULL);
        fd_ringbuffer_del(batch->draw);
        fd_ringbuffer_del(batch->binning);
        fd_ringbuffer_del(batch->gmem);
@@ -88,6 +106,74 @@ __fd_batch_destroy(struct fd_batch *batch)
 
        if (is_a3xx(batch->ctx->screen))
                util_dynarray_fini(&batch->rbrc_patches);
+}
+
+static void
+batch_flush_reset_dependencies(struct fd_batch *batch, bool flush)
+{
+       struct fd_batch_cache *cache = &batch->ctx->screen->batch_cache;
+       struct fd_batch *dep;
+
+       foreach_batch(dep, cache, batch->dependents_mask) {
+               if (flush)
+                       fd_batch_flush(dep);
+               fd_batch_reference(&dep, NULL);
+       }
+
+       batch->dependents_mask = 0;
+}
+
+static void
+batch_reset_resources(struct fd_batch *batch)
+{
+       struct set_entry *entry;
+
+       set_foreach(batch->resources, entry) {
+               struct fd_resource *rsc = (struct fd_resource *)entry->key;
+               _mesa_set_remove(batch->resources, entry);
+               debug_assert(rsc->batch_mask & (1 << batch->idx));
+               rsc->batch_mask &= ~(1 << batch->idx);
+               if (rsc->write_batch == batch)
+                       fd_batch_reference(&rsc->write_batch, NULL);
+       }
+}
+
+static void
+batch_reset(struct fd_batch *batch)
+{
+       DBG("%p", batch);
+
+       batch_flush_reset_dependencies(batch, false);
+       batch_reset_resources(batch);
+
+       batch_fini(batch);
+       batch_init(batch);
+}
+
+void
+fd_batch_reset(struct fd_batch *batch)
+{
+       if (batch->needs_flush)
+               batch_reset(batch);
+}
+
+void
+__fd_batch_destroy(struct fd_batch *batch)
+{
+       fd_bc_invalidate_batch(batch, true);
+
+       DBG("%p", batch);
+
+       util_copy_framebuffer_state(&batch->framebuffer, NULL);
+
+       batch_fini(batch);
+
+       batch_reset_resources(batch);
+       debug_assert(batch->resources->entries == 0);
+       _mesa_set_destroy(batch->resources, NULL);
+
+       batch_flush_reset_dependencies(batch, false);
+       debug_assert(batch->dependents_mask == 0);
 
        free(batch);
 }
@@ -98,46 +184,125 @@ __fd_batch_describe(char* buf, const struct fd_batch *batch)
        util_sprintf(buf, "fd_batch<%u>", batch->seqno);
 }
 
-void
-fd_batch_flush(struct fd_batch *batch)
+static void
+batch_flush(struct fd_batch *batch)
 {
-       struct fd_resource *rsc, *rsc_tmp;
-
        DBG("%p: needs_flush=%d", batch, batch->needs_flush);
 
        if (!batch->needs_flush)
                return;
 
+       batch->needs_flush = false;
+
+       batch_flush_reset_dependencies(batch, true);
+
        fd_gmem_render_tiles(batch);
 
-       /* go through all the used resources and clear their reading flag */
-       LIST_FOR_EACH_ENTRY_SAFE(rsc, rsc_tmp, &batch->used_resources, list) {
-               debug_assert(rsc->pending_batch == batch);
-               debug_assert(rsc->status != 0);
-               rsc->status = 0;
-               fd_batch_reference(&rsc->pending_batch, NULL);
-               list_delinit(&rsc->list);
-       }
+       batch_reset_resources(batch);
+
+       debug_assert(batch->reference.count > 0);
 
-       assert(LIST_IS_EMPTY(&batch->used_resources));
+       if (batch == batch->ctx->batch) {
+               batch_reset(batch);
+       } else {
+               fd_bc_invalidate_batch(batch, false);
+       }
 }
 
 void
-fd_batch_resource_used(struct fd_batch *batch, struct fd_resource *rsc,
-               enum fd_resource_status status)
+fd_batch_flush(struct fd_batch *batch)
+{
+       /* NOTE: we need to hold an extra ref across the body of flush,
+        * since the last ref to this batch could be dropped when cleaning
+        * up the batch's resources
+        */
+       struct fd_batch *tmp = NULL;
+       fd_batch_reference(&tmp, batch);
+       batch_flush(tmp);
+       fd_batch_reference(&tmp, NULL);
+}
+
+/* does 'batch' depend directly or indirectly on 'other' ? */
+static bool
+batch_depends_on(struct fd_batch *batch, struct fd_batch *other)
+{
+       struct fd_batch_cache *cache = &batch->ctx->screen->batch_cache;
+       struct fd_batch *dep;
+
+       if (batch->dependents_mask & (1 << other->idx))
+               return true;
+
+       foreach_batch(dep, cache, batch->dependents_mask)
+               if (batch_depends_on(dep, other))
+                       return true;
+
+       return false;
+}
+
+static void
+batch_add_dep(struct fd_batch *batch, struct fd_batch *dep)
 {
-       rsc->status |= status;
+       if (batch->dependents_mask & (1 << dep->idx))
+               return;
 
+       /* if the new dependency already depends on us, we need to flush
+        * to avoid a loop in the dependency graph.
+        */
+       if (batch_depends_on(dep, batch)) {
+               DBG("%p: flush forced on %p!", batch, dep);
+               fd_batch_flush(dep);
+       } else {
+               struct fd_batch *other = NULL;
+               fd_batch_reference(&other, dep);
+               batch->dependents_mask |= (1 << dep->idx);
+               DBG("%p: added dependency on %p", batch, dep);
+       }
+}
+
+void
+fd_batch_resource_used(struct fd_batch *batch, struct fd_resource *rsc, bool write)
+{
        if (rsc->stencil)
-               rsc->stencil->status |= status;
+               fd_batch_resource_used(batch, rsc->stencil, write);
+
+       DBG("%p: %s %p", batch, write ? "write" : "read", rsc);
 
-       /* TODO resources can actually be shared across contexts,
-        * so I'm not sure a single list-head will do the trick?
+       /* note, invalidate write batch, to avoid further writes to rsc
+        * resulting in a write-after-read hazard.
         */
-       debug_assert((rsc->pending_batch == batch) || !rsc->pending_batch);
-       list_delinit(&rsc->list);
-       list_addtail(&rsc->list, &batch->used_resources);
-       fd_batch_reference(&rsc->pending_batch, batch);
+
+       if (write) {
+               /* if we are pending read or write by any other batch: */
+               if (rsc->batch_mask != (1 << batch->idx)) {
+                       struct fd_batch_cache *cache = &batch->ctx->screen->batch_cache;
+                       struct fd_batch *dep;
+                       foreach_batch(dep, cache, rsc->batch_mask) {
+                               struct fd_batch *b = NULL;
+                               /* note that batch_add_dep could flush and unref dep, so
+                                * we need to hold a reference to keep it live for the
+                                * fd_bc_invalidate_batch()
+                                */
+                               fd_batch_reference(&b, dep);
+                               batch_add_dep(batch, b);
+                               fd_bc_invalidate_batch(b, false);
+                               fd_batch_reference(&b, NULL);
+                       }
+               }
+               fd_batch_reference(&rsc->write_batch, batch);
+       } else {
+               if (rsc->write_batch) {
+                       batch_add_dep(batch, rsc->write_batch);
+                       fd_bc_invalidate_batch(rsc->write_batch, false);
+               }
+       }
+
+       if (rsc->batch_mask & (1 << batch->idx))
+               return;
+
+       debug_assert(!_mesa_set_search(batch->resources, rsc));
+
+       _mesa_set_add(batch->resources, rsc);
+       rsc->batch_mask |= (1 << batch->idx);
 }
 
 void
@@ -149,5 +314,5 @@ fd_batch_check_size(struct fd_batch *batch)
        struct fd_ringbuffer *ring = batch->draw;
        if (((ring->cur - ring->start) > (ring->size/4 - 0x1000)) ||
                        (fd_mesa_debug & FD_DBG_FLUSH))
-               fd_context_render(&batch->ctx->base);
+               fd_batch_flush(batch);
 }
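
The read/write dependency rule implemented by fd_batch_resource_used() above can be
summarized with a small standalone model.  This is not driver code: the struct and
function names below are simplified stand-ins, and the real code additionally breaks
dependency cycles by flushing, invalidates the writer's batch-cache entry, and manages
batch references, all of which the sketch omits.

    /* Standalone model of the dependency rule: a writer must depend on every
     * batch currently touching the resource, a reader only on the current
     * writer.  Build with: cc -o dep_model dep_model.c
     */
    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    struct batch    { unsigned idx; uint32_t dependents_mask; };
    struct resource { uint32_t batch_mask; struct batch *write_batch; };

    static void
    resource_used(struct batch *b, struct resource *r, bool write)
    {
        if (write) {
            /* order the writer after every batch already using the resource */
            b->dependents_mask |= r->batch_mask & ~(1u << b->idx);
            r->write_batch = b;
        } else if (r->write_batch && r->write_batch != b) {
            /* order the reader after the pending writer */
            b->dependents_mask |= (1u << r->write_batch->idx);
        }
        r->batch_mask |= (1u << b->idx);
    }

    int main(void)
    {
        struct batch a = { .idx = 0 }, b = { .idx = 1 };
        struct resource tex = { 0 };

        resource_used(&a, &tex, false);   /* batch A samples from tex */
        resource_used(&b, &tex, true);    /* batch B renders to tex */

        /* flushing B must first flush A, since A's read precedes B's write */
        assert(b.dependents_mask & (1u << a.idx));
        printf("b depends on mask 0x%x\n", b.dependents_mask);
        return 0;
    }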
index 4607250d3af7f8a8f1dd1f6cc46e087b6a9a4bcc..89d1d9fea7b8e86e13d58ea25e24886c4b392643 100644 (file)
@@ -42,6 +42,7 @@ enum fd_resource_status;
 struct fd_batch {
        struct pipe_reference reference;
        unsigned seqno;
+       unsigned idx;
 
        struct fd_context *ctx;
 
@@ -117,15 +118,24 @@ struct fd_batch {
        /** tiling/gmem (IB0) cmdstream: */
        struct fd_ringbuffer *gmem;
 
-       /** list of resources used by currently-unsubmitted batch */
-       struct list_head used_resources;
+       /* Set of resources used by currently-unsubmitted batch (read or
+        * write).. does not hold a reference to the resource.
+        */
+       struct set *resources;
+
+       /** key in batch-cache (if not null): */
+       const void *key;
+       uint32_t hash;
+
+       /** set of batches that this batch depends on.. holds refs to them: */
+       uint32_t dependents_mask;
 };
 
 struct fd_batch * fd_batch_create(struct fd_context *ctx);
 
+void fd_batch_reset(struct fd_batch *batch);
 void fd_batch_flush(struct fd_batch *batch);
-void fd_batch_resource_used(struct fd_batch *batch, struct fd_resource *rsc,
-               enum fd_resource_status status);
+void fd_batch_resource_used(struct fd_batch *batch, struct fd_resource *rsc, bool write);
 void fd_batch_check_size(struct fd_batch *batch);
 
 /* not called directly: */
diff --git a/src/gallium/drivers/freedreno/freedreno_batch_cache.c b/src/gallium/drivers/freedreno/freedreno_batch_cache.c
new file mode 100644 (file)
index 0000000..c947a55
--- /dev/null
@@ -0,0 +1,354 @@
+/*
+ * Copyright (C) 2016 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#include "util/hash_table.h"
+#include "util/set.h"
+#include "util/list.h"
+#include "util/u_string.h"
+
+#include "freedreno_batch.h"
+#include "freedreno_batch_cache.h"
+#include "freedreno_context.h"
+#include "freedreno_resource.h"
+
+/* Overview:
+ *
+ *   The batch cache provides lookup for mapping pipe_framebuffer_state
+ *   to a batch.
+ *
+ *   It does this via a hashtable, with a key that roughly matches the
+ *   pipe_framebuffer_state, as described below.
+ *
+ * Batch Cache hashtable key:
+ *
+ *   To serialize the key, and to avoid dealing with holding a reference to
+ *   pipe_surfaces (which hold a reference to pipe_resource and complicate
+ *   the whole refcounting thing), the key is variable length and inlines the
+ *   pertinent details of the pipe_surface.
+ *
+ * Batch:
+ *
+ *   Each batch needs to hold a reference to each resource it depends on (ie.
+ *   anything that needs a mem2gmem).  And a weak reference to resources it
+ *   renders to.  (If both src[n] and dst[n] are not NULL then they are the
+ *   same.)
+ *
+ *   When a resource is destroyed, we need to remove entries in the batch
+ *   cache that reference the resource, to avoid dangling pointer issues.
+ *   So each resource holds a hashset of batches which reference it in
+ *   their hashtable key.
+ *
+ *   When a batch no longer holds a weak reference to any resources (ie. all
+ *   the surfaces it rendered to are destroyed) the batch can be destroyed.
+ *   This could happen in an app that renders and never uses the result.  A
+ *   more common scenario, I think, is that some, but not all, of the
+ *   surfaces are destroyed before the batch is submitted.
+ *
+ *   If (for example) a batch writes to the zsbuf but that surface is
+ *   destroyed before the batch is submitted, we can skip gmem2mem (but
+ *   still need to alloc gmem space as before).  If the batch depended on
+ *   the previous contents of that surface, it would be holding a reference
+ *   so the surface would not have been destroyed.
+ */
+
+struct key {
+       uint32_t width, height, layers;
+       uint16_t samples, num_surfs;
+       struct fd_context *ctx;
+       struct {
+               struct pipe_resource *texture;
+               union pipe_surface_desc u;
+               uint16_t pos, format;
+       } surf[0];
+};
+
+static struct key *
+key_alloc(unsigned num_surfs)
+{
+       struct key *key =
+               CALLOC_VARIANT_LENGTH_STRUCT(key, sizeof(key->surf[0]) * num_surfs);
+       return key;
+}
+
+static uint32_t
+key_hash(const void *_key)
+{
+       const struct key *key = _key;
+       uint32_t hash = _mesa_fnv32_1a_offset_bias;
+       hash = _mesa_fnv32_1a_accumulate_block(hash, key, offsetof(struct key, surf[0]));
+       hash = _mesa_fnv32_1a_accumulate_block(hash, key->surf, sizeof(key->surf[0]) * key->num_surfs);
+       return hash;
+}
+
+static bool
+key_equals(const void *_a, const void *_b)
+{
+       const struct key *a = _a;
+       const struct key *b = _b;
+       return (memcmp(a, b, offsetof(struct key, surf[0])) == 0) &&
+               (memcmp(a->surf, b->surf, sizeof(a->surf[0]) * a->num_surfs) == 0);
+}
+
+void
+fd_bc_init(struct fd_batch_cache *cache)
+{
+       cache->ht = _mesa_hash_table_create(NULL, key_hash, key_equals);
+}
+
+void
+fd_bc_fini(struct fd_batch_cache *cache)
+{
+       _mesa_hash_table_destroy(cache->ht, NULL);
+}
+
+uint32_t
+fd_bc_flush(struct fd_batch_cache *cache, struct fd_context *ctx)
+{
+       struct hash_entry *entry;
+       uint32_t timestamp = 0;
+
+       hash_table_foreach(cache->ht, entry) {
+               struct fd_batch *batch = NULL;
+               fd_batch_reference(&batch, (struct fd_batch *)entry->data);
+               if (batch->ctx == ctx) {
+                       fd_batch_flush(batch);
+                       timestamp = MAX2(timestamp, fd_ringbuffer_timestamp(batch->gmem));
+               }
+               fd_batch_reference(&batch, NULL);
+       }
+
+       return timestamp;
+}
+
+void
+fd_bc_invalidate_context(struct fd_context *ctx)
+{
+       struct fd_batch_cache *cache = &ctx->screen->batch_cache;
+       struct fd_batch *batch;
+
+       foreach_batch(batch, cache, cache->batch_mask) {
+               if (batch->ctx == ctx) {
+                       fd_batch_reset(batch);
+                       fd_batch_reference(&batch, NULL);
+               }
+       }
+}
+
+void
+fd_bc_invalidate_batch(struct fd_batch *batch, bool destroy)
+{
+       struct fd_batch_cache *cache = &batch->ctx->screen->batch_cache;
+       struct key *key = (struct key *)batch->key;
+
+       if (destroy) {
+               cache->batches[batch->idx] = NULL;
+               cache->batch_mask &= ~(1 << batch->idx);
+       }
+
+       if (!key)
+               return;
+
+       DBG("%p: key=%p", batch, batch->key);
+       for (unsigned idx = 0; idx < key->num_surfs; idx++) {
+               struct fd_resource *rsc = fd_resource(key->surf[idx].texture);
+               rsc->bc_batch_mask &= ~(1 << batch->idx);
+       }
+
+       struct hash_entry *entry =
+               _mesa_hash_table_search_pre_hashed(cache->ht, batch->hash, key);
+       _mesa_hash_table_remove(cache->ht, entry);
+
+       batch->key = NULL;
+       free(key);
+}
+
+void
+fd_bc_invalidate_resource(struct fd_resource *rsc, bool destroy)
+{
+       struct fd_screen *screen = fd_screen(rsc->base.b.screen);
+       struct fd_batch *batch;
+
+       if (destroy) {
+               foreach_batch(batch, &screen->batch_cache, rsc->batch_mask) {
+                       struct set_entry *entry = _mesa_set_search(batch->resources, rsc);
+                       _mesa_set_remove(batch->resources, entry);
+               }
+               rsc->batch_mask = 0;
+
+               fd_batch_reference(&rsc->write_batch, NULL);
+       }
+
+       foreach_batch(batch, &screen->batch_cache, rsc->bc_batch_mask)
+               fd_bc_invalidate_batch(batch, false);
+
+       rsc->bc_batch_mask = 0;
+}
+
+struct fd_batch *
+fd_bc_alloc_batch(struct fd_batch_cache *cache, struct fd_context *ctx)
+{
+       struct fd_batch *batch;
+       uint32_t idx;
+
+       while ((idx = ffs(~cache->batch_mask)) == 0) {
+#if 0
+               for (unsigned i = 0; i < ARRAY_SIZE(cache->batches); i++) {
+                       batch = cache->batches[i];
+                       debug_printf("%d: needs_flush=%d, depends:", batch->idx, batch->needs_flush);
+                       struct set_entry *entry;
+                       set_foreach(batch->dependencies, entry) {
+                               struct fd_batch *dep = (struct fd_batch *)entry->key;
+                               debug_printf(" %d", dep->idx);
+                       }
+                       debug_printf("\n");
+               }
+#endif
+               /* TODO: is LRU the better policy?  Or perhaps the batch that
+                * depends on the fewest other batches?
+                */
+               struct fd_batch *flush_batch = NULL;
+               for (unsigned i = 0; i < ARRAY_SIZE(cache->batches); i++) {
+                       if ((cache->batches[i] == ctx->batch) ||
+                                       !cache->batches[i]->needs_flush)
+                               continue;
+                       if (!flush_batch || (cache->batches[i]->seqno < flush_batch->seqno))
+                               fd_batch_reference(&flush_batch, cache->batches[i]);
+               }
+               DBG("%p: too many batches!  flush forced!", flush_batch);
+               fd_batch_flush(flush_batch);
+
+               /* While the resources get cleaned up automatically, the flush_batch
+                * doesn't get removed from the dependencies of other batches, so
+                * it won't be unref'd and will remain in the table.
+                *
+                * TODO maybe keep a bitmask of batches that depend on me, to make
+                * this easier:
+                */
+               for (unsigned i = 0; i < ARRAY_SIZE(cache->batches); i++) {
+                       struct fd_batch *other = cache->batches[i];
+                       if (!other)
+                               continue;
+                       if (other->dependents_mask & (1 << flush_batch->idx)) {
+                               other->dependents_mask &= ~(1 << flush_batch->idx);
+                               struct fd_batch *ref = flush_batch;
+                               fd_batch_reference(&ref, NULL);
+                       }
+               }
+
+               fd_batch_reference(&flush_batch, NULL);
+       }
+
+       idx--;              /* bit zero returns 1 for ffs() */
+
+       batch = fd_batch_create(ctx);
+       if (!batch)
+               return NULL;
+
+       batch->seqno = cache->cnt++;
+       batch->idx = idx;
+       cache->batch_mask |= (1 << idx);
+
+       debug_assert(cache->batches[idx] == NULL);
+       cache->batches[idx] = batch;
+
+       return batch;
+}
+
+static struct fd_batch *
+batch_from_key(struct fd_batch_cache *cache, struct key *key,
+               struct fd_context *ctx)
+{
+       struct fd_batch *batch = NULL;
+       uint32_t hash = key_hash(key);
+       struct hash_entry *entry =
+               _mesa_hash_table_search_pre_hashed(cache->ht, hash, key);
+
+       if (entry) {
+               free(key);
+               fd_batch_reference(&batch, (struct fd_batch *)entry->data);
+               return batch;
+       }
+
+       batch = fd_bc_alloc_batch(cache, ctx);
+#ifdef DEBUG
+       DBG("%p: hash=0x%08x, %ux%u, %u layers, %u samples", batch, hash,
+                       key->width, key->height, key->layers, key->samples);
+       for (unsigned idx = 0; idx < key->num_surfs; idx++) {
+               DBG("%p:  surf[%u]: %p (%s) (%u,%u / %u,%u,%u)", batch, key->surf[idx].pos,
+                       key->surf[idx].texture, util_format_name(key->surf[idx].format),
+                       key->surf[idx].u.buf.first_element, key->surf[idx].u.buf.last_element,
+                       key->surf[idx].u.tex.first_layer, key->surf[idx].u.tex.last_layer,
+                       key->surf[idx].u.tex.level);
+       }
+#endif
+       if (!batch)
+               return NULL;
+
+       _mesa_hash_table_insert_pre_hashed(cache->ht, hash, key, batch);
+       batch->key = key;
+       batch->hash = hash;
+
+       for (unsigned idx = 0; idx < key->num_surfs; idx++) {
+               struct fd_resource *rsc = fd_resource(key->surf[idx].texture);
+               rsc->bc_batch_mask = (1 << batch->idx);
+       }
+
+       return batch;
+}
+
+static void
+key_surf(struct key *key, unsigned idx, unsigned pos, struct pipe_surface *psurf)
+{
+       key->surf[idx].texture = psurf->texture;
+       key->surf[idx].u = psurf->u;
+       key->surf[idx].pos = pos;
+       key->surf[idx].format = psurf->format;
+}
+
+struct fd_batch *
+fd_batch_from_fb(struct fd_batch_cache *cache, struct fd_context *ctx,
+               const struct pipe_framebuffer_state *pfb)
+{
+       unsigned idx = 0, n = pfb->nr_cbufs + (pfb->zsbuf ? 1 : 0);
+       struct key *key = key_alloc(n);
+
+       key->width = pfb->width;
+       key->height = pfb->height;
+       key->layers = pfb->layers;
+       key->samples = pfb->samples;
+       key->ctx = ctx;
+
+       if (pfb->zsbuf)
+               key_surf(key, idx++, 0, pfb->zsbuf);
+
+       for (unsigned i = 0; i < pfb->nr_cbufs; i++)
+               if (pfb->cbufs[i])
+                       key_surf(key, idx++, i + 1, pfb->cbufs[i]);
+
+       key->num_surfs = idx;
+
+       return batch_from_key(cache, key, ctx);
+}
diff --git a/src/gallium/drivers/freedreno/freedreno_batch_cache.h b/src/gallium/drivers/freedreno/freedreno_batch_cache.h
new file mode 100644 (file)
index 0000000..90500d5
--- /dev/null
@@ -0,0 +1,73 @@
+/*
+ * Copyright (C) 2016 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#ifndef FREEDRENO_BATCH_CACHE_H_
+#define FREEDRENO_BATCH_CACHE_H_
+
+#include "pipe/p_state.h"
+
+#include "freedreno_batch.h"
+
+struct hash_table;
+
+struct fd_batch_cache {
+       struct hash_table *ht;
+       unsigned cnt;
+
+       /* set of active batches.. there is an upper limit on the number of
+        * in-flight batches, for two reasons:
+        * 1) to avoid big spikes in number of batches in edge cases, such as
+        *    game startup (ie, lots of texture uploads, but no usages yet of
+        *    the textures), etc.
+        * 2) so we can use a simple bitmask in fd_resource to track which
+        *    batches have a reference to the resource
+        */
+       struct fd_batch *batches[32];
+       uint32_t batch_mask;
+};
+
+/* note: if batches get unref'd in the body of the loop, they are removed
+ * from the various masks.. but since we copy the mask at the beginning of
+ * the loop into _m, we need the &= at the end of the loop to make sure
+ * we don't have stale bits in _m
+ */
+#define foreach_batch(batch, cache, mask) \
+       for (uint32_t _m = (mask); _m && ((batch) = (cache)->batches[u_bit_scan(&_m)]); _m &= (mask))
+
+void fd_bc_init(struct fd_batch_cache *cache);
+void fd_bc_fini(struct fd_batch_cache *cache);
+
+uint32_t fd_bc_flush(struct fd_batch_cache *cache, struct fd_context *ctx);
+
+void fd_bc_invalidate_context(struct fd_context *ctx);
+void fd_bc_invalidate_batch(struct fd_batch *batch, bool destroy);
+void fd_bc_invalidate_resource(struct fd_resource *rsc, bool destroy);
+struct fd_batch * fd_bc_alloc_batch(struct fd_batch_cache *cache, struct fd_context *ctx);
+
+struct fd_batch * fd_batch_from_fb(struct fd_batch_cache *cache,
+               struct fd_context *ctx, const struct pipe_framebuffer_state *pfb);
+
+#endif /* FREEDRENO_BATCH_CACHE_H_ */
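
The `_m &= (mask)` step in the foreach_batch() macro above is easy to miss, so here is a
standalone sketch of the same masking idea.  This is not driver code: bit_scan() is a
stand-in for mesa's u_bit_scan(), and the mask values and batch indices are made up.

    #include <stdint.h>
    #include <stdio.h>

    /* stand-in for u_bit_scan(): pop the lowest set bit, return its index */
    static int bit_scan(uint32_t *mask)
    {
        int i = __builtin_ctz(*mask);
        *mask &= ~(1u << i);
        return i;
    }

    int main(void)
    {
        uint32_t live_mask = 0x2d;   /* batches 0, 2, 3 and 5 are in flight */

        /* same shape as foreach_batch(): copy the mask, then re-intersect
         * with the live mask each iteration so entries that went away in
         * the loop body are skipped rather than visited stale.
         */
        for (uint32_t m = live_mask; m; m &= live_mask) {
            int idx = bit_scan(&m);
            printf("visiting batch %d\n", idx);
            if (idx == 2)
                live_mask &= ~(1u << 3);   /* pretend batch 3 went away here */
        }
        /* batch 3 is never visited because the re-mask drops its stale bit */
        return 0;
    }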
index b9a1fe977457370edcc117f113b2d6e2fd047a19..e81d31175f0a931dc9da17305446b9972025328a 100644 (file)
 #include "freedreno_query_hw.h"
 #include "freedreno_util.h"
 
-/* emit accumulated render cmds, needed for example if render target has
- * changed, or for flush()
- */
-void
-fd_context_render(struct pipe_context *pctx)
-{
-       struct fd_context *ctx = fd_context(pctx);
-       struct fd_batch *new_batch;
-
-       fd_batch_flush(ctx->batch);
-
-       new_batch = fd_batch_create(ctx);
-       util_copy_framebuffer_state(&new_batch->framebuffer, &ctx->batch->framebuffer);
-       fd_batch_reference(&ctx->batch, NULL);
-       ctx->batch = new_batch;
-}
-
 static void
 fd_context_flush(struct pipe_context *pctx, struct pipe_fence_handle **fence,
                unsigned flags)
 {
-       struct fd_batch *batch = NULL;
-
-       fd_batch_reference(&batch, fd_context(pctx)->batch);
-
-       fd_context_render(pctx);
+       struct fd_context *ctx = fd_context(pctx);
+       uint32_t timestamp;
+
+       if (!ctx->screen->reorder) {
+               struct fd_batch *batch = NULL;
+               fd_batch_reference(&batch, ctx->batch);
+               fd_batch_flush(batch);
+               timestamp = fd_ringbuffer_timestamp(batch->gmem);
+               fd_batch_reference(&batch, NULL);
+       } else {
+               timestamp = fd_bc_flush(&ctx->screen->batch_cache, ctx);
+       }
 
        if (fence) {
                fd_screen_fence_ref(pctx->screen, fence, NULL);
-               *fence = fd_fence_create(pctx, fd_ringbuffer_timestamp(batch->gmem));
+               *fence = fd_fence_create(pctx, timestamp);
        }
-
-       fd_batch_reference(&batch, NULL);
 }
 
 /**
@@ -81,9 +69,14 @@ static void
 fd_emit_string_marker(struct pipe_context *pctx, const char *string, int len)
 {
        struct fd_context *ctx = fd_context(pctx);
-       struct fd_ringbuffer *ring = ctx->batch->draw;
+       struct fd_ringbuffer *ring;
        const uint32_t *buf = (const void *)string;
 
+       if (!ctx->batch)
+               return;
+
+       ring = ctx->batch->draw;
+
        /* max packet size is 0x3fff dwords: */
        len = MIN2(len, 0x3fff * 4);
 
@@ -110,6 +103,9 @@ fd_context_destroy(struct pipe_context *pctx)
 
        DBG("");
 
+       fd_batch_reference(&ctx->batch, NULL);  /* unref current batch */
+       fd_bc_invalidate_context(ctx);
+
        fd_prog_fini(pctx);
        fd_hw_query_fini(pctx);
 
@@ -121,8 +117,6 @@ fd_context_destroy(struct pipe_context *pctx)
 
        util_slab_destroy(&ctx->transfer_pool);
 
-       fd_batch_reference(&ctx->batch, NULL);  /* unref current batch */
-
        for (i = 0; i < ARRAY_SIZE(ctx->pipe); i++) {
                struct fd_vsc_pipe *pipe = &ctx->pipe[i];
                if (!pipe->bo)
@@ -177,7 +171,12 @@ fd_context_init(struct fd_context *ctx, struct pipe_screen *pscreen,
        pctx->emit_string_marker = fd_emit_string_marker;
        pctx->set_debug_callback = fd_set_debug_callback;
 
-       ctx->batch = fd_batch_create(ctx);
+       /* TODO what about compute?  Ideally it creates its own independent
+        * batches per compute job (since it isn't using tiling, so no point
+        * in getting involved with the re-ordering madness)..
+        */
+       if (!screen->reorder)
+               ctx->batch = fd_bc_alloc_batch(&screen->batch_cache, ctx);
 
        fd_reset_wfi(ctx);
 
index 9401367dca5ef005087d723ee26d98c00ffd7f2a..f25ec34758f5228893705ce48c993bd2f892da1e 100644 (file)
@@ -381,8 +381,6 @@ struct pipe_context * fd_context_init(struct fd_context *ctx,
                struct pipe_screen *pscreen, const uint8_t *primtypes,
                void *priv);
 
-void fd_context_render(struct pipe_context *pctx);
-
 void fd_context_destroy(struct pipe_context *pctx);
 
 #endif /* FREEDRENO_CONTEXT_H_ */
index b9477620176b0fbfab54cadde1e8c376b9d56847..f067715e5358d7a2a315d3d14be569a85a47b109 100644 (file)
@@ -44,7 +44,7 @@ resource_read(struct fd_batch *batch, struct pipe_resource *prsc)
 {
        if (!prsc)
                return;
-       fd_batch_resource_used(batch, fd_resource(prsc), FD_PENDING_READ);
+       fd_batch_resource_used(batch, fd_resource(prsc), false);
 }
 
 static void
@@ -52,7 +52,7 @@ resource_written(struct fd_batch *batch, struct pipe_resource *prsc)
 {
        if (!prsc)
                return;
-       fd_batch_resource_used(batch, fd_resource(prsc), FD_PENDING_WRITE);
+       fd_batch_resource_used(batch, fd_resource(prsc), true);
 }
 
 static void
index 76d90d604109a700ffaa8fe5224bb43ff4915eeb..4c450c62dc4f503aceb46738a056719d6cb15656 100644 (file)
@@ -216,7 +216,7 @@ fd_hw_get_query_result(struct fd_context *ctx, struct fd_query *q,
                if (!ctx->batch->needs_flush)
                        return true;
                DBG("reading query result forces flush!");
-               fd_context_render(&ctx->base);
+               fd_batch_flush(ctx->batch);
        }
 
        util_query_clear_result(result, q->type);
index 20d68fed9d546a7de07797f29e41d8eeac3e051c..ae8061cba28f365cd14d3435e4cea4b4b11465ca 100644 (file)
 #include "util/u_transfer.h"
 #include "util/u_string.h"
 #include "util/u_surface.h"
+#include "util/set.h"
 
 #include "freedreno_resource.h"
+#include "freedreno_batch_cache.h"
 #include "freedreno_screen.h"
 #include "freedreno_surface.h"
 #include "freedreno_context.h"
 #include "state_tracker/drm_driver.h"
 
 static bool
-pending(struct fd_resource *rsc, enum fd_resource_status status)
+pending(struct fd_resource *rsc, bool write)
 {
-       return (rsc->status & status) ||
-               (rsc->stencil && (rsc->stencil->status & status));
+       /* if we have a pending GPU write, we are busy in any case: */
+       if (rsc->write_batch)
+               return true;
+
+       /* if CPU wants to write, but we are pending a GPU read, we are busy: */
+       if (write && rsc->batch_mask)
+               return true;
+
+       if (rsc->stencil && pending(rsc->stencil, write))
+               return true;
+
+       return false;
 }
 
 static void
@@ -108,10 +120,8 @@ realloc_bo(struct fd_resource *rsc, uint32_t size)
 
        rsc->bo = fd_bo_new(screen->dev, size, flags);
        rsc->timestamp = 0;
-       rsc->status = 0;
-       fd_batch_reference(&rsc->pending_batch, NULL);
-       list_delinit(&rsc->list);
        util_range_set_empty(&rsc->valid_buffer_range);
+       fd_bc_invalidate_resource(rsc, true);
 }
 
 static unsigned
@@ -324,9 +334,18 @@ fd_resource_transfer_map(struct pipe_context *pctx,
                 * resource and we're trying to write to it, flush the renders.
                 */
                if (((ptrans->usage & PIPE_TRANSFER_WRITE) &&
-                                       pending(rsc, FD_PENDING_READ | FD_PENDING_WRITE)) ||
-                               pending(rsc, FD_PENDING_WRITE))
-                       fd_context_render(pctx);
+                                       pending(rsc, true)) ||
+                               pending(rsc, false)) {
+                       if (usage & PIPE_TRANSFER_WRITE) {
+                               struct fd_batch *batch;
+                               foreach_batch(batch, &ctx->screen->batch_cache, rsc->batch_mask) {
+                                       fd_batch_flush(batch);
+                               }
+                               assert(rsc->batch_mask == 0);
+                       } else {
+                               fd_batch_flush(rsc->write_batch);
+                       }
+               }
 
                /* The GPU keeps track of how the various bo's are being used, and
                 * will wait if necessary for the proper operation to have
@@ -451,10 +470,9 @@ fd_resource_destroy(struct pipe_screen *pscreen,
                struct pipe_resource *prsc)
 {
        struct fd_resource *rsc = fd_resource(prsc);
+       fd_bc_invalidate_resource(rsc, true);
        if (rsc->bo)
                fd_bo_del(rsc->bo);
-       fd_batch_reference(&rsc->pending_batch, NULL);
-       list_delinit(&rsc->list);
        util_range_destroy(&rsc->valid_buffer_range);
        FREE(rsc);
 }
@@ -570,7 +588,7 @@ fd_resource_create(struct pipe_screen *pscreen,
        *prsc = *tmpl;
 
        pipe_reference_init(&prsc->reference, 1);
-       list_inithead(&rsc->list);
+
        prsc->screen = pscreen;
 
        util_range_init(&rsc->valid_buffer_range);
@@ -657,7 +675,7 @@ fd_resource_from_handle(struct pipe_screen *pscreen,
        *prsc = *tmpl;
 
        pipe_reference_init(&prsc->reference, 1);
-       list_inithead(&rsc->list);
+
        prsc->screen = pscreen;
 
        util_range_init(&rsc->valid_buffer_range);
@@ -846,8 +864,10 @@ fd_flush_resource(struct pipe_context *pctx, struct pipe_resource *prsc)
 {
        struct fd_resource *rsc = fd_resource(prsc);
 
-       if (pending(rsc, FD_PENDING_WRITE | FD_PENDING_READ))
-               fd_context_render(pctx);
+       if (rsc->write_batch)
+               fd_batch_flush(rsc->write_batch);
+
+       assert(!rsc->write_batch);
 }
 
 void
index f8131c774ecedfc7ac2064e82d9ff0e973f9eca9..fcdb4c1e3648021bfec6f042149d97c5ad5d3112 100644 (file)
@@ -61,14 +61,7 @@ struct fd_resource_slice {
        uint32_t size0;          /* size of first layer in slice */
 };
 
-/* status of queued up but not flushed reads and write operations.
- * In _transfer_map() we need to know if queued up rendering needs
- * to be flushed to preserve the order of cpu and gpu access.
- */
-enum fd_resource_status {
-       FD_PENDING_WRITE = 0x01,
-       FD_PENDING_READ  = 0x02,
-};
+struct set;
 
 struct fd_resource {
        struct u_resource base;
@@ -86,13 +79,23 @@ struct fd_resource {
        /* TODO rename to secondary or auxiliary? */
        struct fd_resource *stencil;
 
-       /* pending read/write state: */
-       enum fd_resource_status status;
-       /* resources accessed by queued but not flushed draws are tracked
-        * in the used_resources list.
+       /* bitmask of in-flight batches which reference this resource.  Note
+        * that the batch doesn't hold a reference to resources (but instead
+        * the fd_ringbuffer holds refs to the underlying fd_bo), but in case
+        * the resource is destroyed we need to clean up the batch's weak
+        * references to us.
+        */
+       uint32_t batch_mask;
+
+       /* reference to batch that writes this resource: */
+       struct fd_batch *write_batch;
+
+       /* Set of batches whose batch-cache key references this resource.
+        * We need to track this to know which batch-cache entries to
+        * invalidate if, for example, the resource is invalidated or
+        * shadowed.
         */
-       struct list_head list;
-       struct fd_batch *pending_batch;
+       uint32_t bc_batch_mask;
 };
 
 static inline struct fd_resource *
index 222f2881f0180ab791291eb591ce6c6eaf4fe85c..31f2cb2147aadf97962a3c6a545541948662cf5c 100644 (file)
@@ -75,6 +75,7 @@ static const struct debug_named_value debug_options[] = {
                {"flush",     FD_DBG_FLUSH,  "Force flush after every draw"},
                {"deqp",      FD_DBG_DEQP,   "Enable dEQP hacks"},
                {"nir",       FD_DBG_NIR,    "Prefer NIR as native IR"},
+               {"reorder",   FD_DBG_REORDER,"Enable reordering for draws/blits"},
                DEBUG_NAMED_VALUE_END
 };
 
@@ -134,6 +135,8 @@ fd_screen_destroy(struct pipe_screen *pscreen)
        if (screen->dev)
                fd_device_del(screen->dev);
 
+       fd_bc_fini(&screen->batch_cache);
+
        free(screen);
 }
 
@@ -662,6 +665,16 @@ fd_screen_create(struct fd_device *dev)
                goto fail;
        }
 
+       /* NOTE: don't enable reordering on a2xx, since completely untested.
+        * Also, don't enable if we have too old of a kernel to support
+        * growable cmdstream buffers, since memory requirement for cmdstream
+        * buffers would be too much otherwise.
+        */
+       if ((screen->gpu_id >= 300) && (fd_device_version(dev) >= FD_VERSION_UNLIMITED_CMDS))
+               screen->reorder = !!(fd_mesa_debug & FD_DBG_REORDER);
+
+       fd_bc_init(&screen->batch_cache);
+
        pscreen->destroy = fd_screen_destroy;
        pscreen->get_param = fd_screen_get_param;
        pscreen->get_paramf = fd_screen_get_paramf;
index 0c899d5a7f07d2933241ad3509b8a9e5026300b3..38d38f2f1abe0830a95d9bffbb982fe4f1d2f727 100644 (file)
@@ -35,7 +35,7 @@
 #include "pipe/p_screen.h"
 #include "util/u_memory.h"
 
-typedef uint32_t u32;
+#include "freedreno_batch_cache.h"
 
 struct fd_bo;
 
@@ -66,6 +66,10 @@ struct fd_screen {
        struct fd_pipe *pipe;
 
        int64_t cpu_gpu_time_delta;
+
+       struct fd_batch_cache batch_cache;
+
+       bool reorder;
 };
 
 static inline struct fd_screen *
index 98b56c7d512d84e61e6c484a1ddf798907e8e961..8ac41d290778b5a924979980b87c4bac01c0d09e 100644 (file)
@@ -117,10 +117,17 @@ fd_set_framebuffer_state(struct pipe_context *pctx,
        struct fd_context *ctx = fd_context(pctx);
        struct pipe_framebuffer_state *cso;
 
-       DBG("%d: cbufs[0]=%p, zsbuf=%p", ctx->batch->needs_flush,
-                       framebuffer->cbufs[0], framebuffer->zsbuf);
-
-       fd_context_render(pctx);
+       if (ctx->screen->reorder) {
+               struct fd_batch *batch =
+                       fd_batch_from_fb(&ctx->screen->batch_cache, ctx, framebuffer);
+               fd_batch_reference(&ctx->batch, NULL);
+               ctx->batch = batch;
+               ctx->dirty = ~0;
+       } else {
+               DBG("%d: cbufs[0]=%p, zsbuf=%p", ctx->batch->needs_flush,
+                               framebuffer->cbufs[0], framebuffer->zsbuf);
+               fd_batch_flush(ctx->batch);
+       }
 
        cso = &ctx->batch->framebuffer;
 
index 8f125d95554fba5745df28970578aaca3b0b721c..5cb958e65ab043b0e246b6032597a85cf4e1fcf1 100644 (file)
@@ -75,6 +75,7 @@ enum adreno_stencil_op fd_stencil_op(unsigned op);
 #define FD_DBG_FLUSH    0x1000
 #define FD_DBG_DEQP     0x2000
 #define FD_DBG_NIR      0x4000
+#define FD_DBG_REORDER  0x8000
 
 extern int fd_mesa_debug;
 extern bool fd_binning_enabled;