From: Rob Clark Date: Mon, 27 Jun 2016 13:44:15 +0000 (-0400) Subject: freedreno: add batch-cache and batch reordering X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=9f219c7047b51561f6f69274d445e6a6ec41c5f8;p=mesa.git freedreno: add batch-cache and batch reordering Note that I originally also had an entry-point that would construct a key and do lookup from a pipe_surface. I ended up not needing that (yet?) but it is easy-enough to re-introduce later if we need it for the blit path. For now, not enabled by default, but can be enabled (on a3xx/a4xx) with FD_MESA_DEBUG=reorder. Signed-off-by: Rob Clark --- diff --git a/src/gallium/drivers/freedreno/Makefile.sources b/src/gallium/drivers/freedreno/Makefile.sources index 4ba8c9dd19a..92d9186597c 100644 --- a/src/gallium/drivers/freedreno/Makefile.sources +++ b/src/gallium/drivers/freedreno/Makefile.sources @@ -4,6 +4,8 @@ C_SOURCES := \ disasm.h \ freedreno_batch.c \ freedreno_batch.h \ + freedreno_batch_cache.c \ + freedreno_batch_cache.h \ freedreno_context.c \ freedreno_context.h \ freedreno_draw.c \ diff --git a/src/gallium/drivers/freedreno/freedreno_batch.c b/src/gallium/drivers/freedreno/freedreno_batch.c index 1fbce43f62c..2dd7eda72ad 100644 --- a/src/gallium/drivers/freedreno/freedreno_batch.c +++ b/src/gallium/drivers/freedreno/freedreno_batch.c @@ -25,26 +25,20 @@ */ #include "util/list.h" +#include "util/set.h" +#include "util/hash_table.h" #include "util/u_string.h" #include "freedreno_batch.h" #include "freedreno_context.h" #include "freedreno_resource.h" -struct fd_batch * -fd_batch_create(struct fd_context *ctx) +static void +batch_init(struct fd_batch *batch) { - struct fd_batch *batch = CALLOC_STRUCT(fd_batch); - static unsigned seqno = 0; + struct fd_context *ctx = batch->ctx; unsigned size = 0; - if (!batch) - return NULL; - - pipe_reference_init(&batch->reference, 1); - batch->seqno = ++seqno; - batch->ctx = ctx; - /* if kernel is too old to support unlimited # of cmd buffers, we * have
no option but to allocate large worst-case sizes so that * we don't need to grow the ringbuffer. Performance is likely to @@ -62,7 +56,11 @@ fd_batch_create(struct fd_context *ctx) fd_ringbuffer_set_parent(batch->draw, batch->gmem); fd_ringbuffer_set_parent(batch->binning, batch->gmem); - list_inithead(&batch->used_resources); + batch->cleared = batch->partial_cleared = 0; + batch->restore = batch->resolve = 0; + batch->needs_flush = false; + batch->gmem_reason = 0; + batch->num_draws = 0; /* reset maximal bounds: */ batch->max_scissor.minx = batch->max_scissor.miny = ~0; @@ -73,13 +71,33 @@ fd_batch_create(struct fd_context *ctx) if (is_a3xx(ctx->screen)) util_dynarray_init(&batch->rbrc_patches); + assert(batch->resources->entries == 0); +} + +struct fd_batch * +fd_batch_create(struct fd_context *ctx) +{ + struct fd_batch *batch = CALLOC_STRUCT(fd_batch); + + if (!batch) + return NULL; + + DBG("%p", batch); + + pipe_reference_init(&batch->reference, 1); + batch->ctx = ctx; + + batch->resources = _mesa_set_create(NULL, _mesa_hash_pointer, + _mesa_key_pointer_equal); + + batch_init(batch); + return batch; } -void -__fd_batch_destroy(struct fd_batch *batch) +static void +batch_fini(struct fd_batch *batch) { - util_copy_framebuffer_state(&batch->framebuffer, NULL); fd_ringbuffer_del(batch->draw); fd_ringbuffer_del(batch->binning); fd_ringbuffer_del(batch->gmem); @@ -88,6 +106,74 @@ __fd_batch_destroy(struct fd_batch *batch) if (is_a3xx(batch->ctx->screen)) util_dynarray_fini(&batch->rbrc_patches); +} + +static void +batch_flush_reset_dependencies(struct fd_batch *batch, bool flush) +{ + struct fd_batch_cache *cache = &batch->ctx->screen->batch_cache; + struct fd_batch *dep; + + foreach_batch(dep, cache, batch->dependents_mask) { + if (flush) + fd_batch_flush(dep); + fd_batch_reference(&dep, NULL); + } + + batch->dependents_mask = 0; +} + +static void +batch_reset_resources(struct fd_batch *batch) +{ + struct set_entry *entry; + + set_foreach(batch->resources, entry) { 
+ struct fd_resource *rsc = (struct fd_resource *)entry->key; + _mesa_set_remove(batch->resources, entry); + debug_assert(rsc->batch_mask & (1 << batch->idx)); + rsc->batch_mask &= ~(1 << batch->idx); + if (rsc->write_batch == batch) + fd_batch_reference(&rsc->write_batch, NULL); + } +} + +static void +batch_reset(struct fd_batch *batch) +{ + DBG("%p", batch); + + batch_flush_reset_dependencies(batch, false); + batch_reset_resources(batch); + + batch_fini(batch); + batch_init(batch); +} + +void +fd_batch_reset(struct fd_batch *batch) +{ + if (batch->needs_flush) + batch_reset(batch); +} + +void +__fd_batch_destroy(struct fd_batch *batch) +{ + fd_bc_invalidate_batch(batch, true); + + DBG("%p", batch); + + util_copy_framebuffer_state(&batch->framebuffer, NULL); + + batch_fini(batch); + + batch_reset_resources(batch); + debug_assert(batch->resources->entries == 0); + _mesa_set_destroy(batch->resources, NULL); + + batch_flush_reset_dependencies(batch, false); + debug_assert(batch->dependents_mask == 0); free(batch); } @@ -98,46 +184,125 @@ __fd_batch_describe(char* buf, const struct fd_batch *batch) util_sprintf(buf, "fd_batch<%u>", batch->seqno); } -void -fd_batch_flush(struct fd_batch *batch) +static void +batch_flush(struct fd_batch *batch) { - struct fd_resource *rsc, *rsc_tmp; - DBG("%p: needs_flush=%d", batch, batch->needs_flush); if (!batch->needs_flush) return; + batch->needs_flush = false; + + batch_flush_reset_dependencies(batch, true); + fd_gmem_render_tiles(batch); - /* go through all the used resources and clear their reading flag */ - LIST_FOR_EACH_ENTRY_SAFE(rsc, rsc_tmp, &batch->used_resources, list) { - debug_assert(rsc->pending_batch == batch); - debug_assert(rsc->status != 0); - rsc->status = 0; - fd_batch_reference(&rsc->pending_batch, NULL); - list_delinit(&rsc->list); - } + batch_reset_resources(batch); + + debug_assert(batch->reference.count > 0); - assert(LIST_IS_EMPTY(&batch->used_resources)); + if (batch == batch->ctx->batch) { + 
batch_reset(batch); + } else { + fd_bc_invalidate_batch(batch, false); + } } void -fd_batch_resource_used(struct fd_batch *batch, struct fd_resource *rsc, - enum fd_resource_status status) +fd_batch_flush(struct fd_batch *batch) +{ + /* NOTE: we need to hold an extra ref across the body of flush, + * since the last ref to this batch could be dropped when cleaning + * up used_resources + */ + struct fd_batch *tmp = NULL; + fd_batch_reference(&tmp, batch); + batch_flush(tmp); + fd_batch_reference(&tmp, NULL); +} + +/* does 'batch' depend directly or indirectly on 'other' ? */ +static bool +batch_depends_on(struct fd_batch *batch, struct fd_batch *other) +{ + struct fd_batch_cache *cache = &batch->ctx->screen->batch_cache; + struct fd_batch *dep; + + if (batch->dependents_mask & (1 << other->idx)) + return true; + + foreach_batch(dep, cache, batch->dependents_mask) + if (batch_depends_on(batch, dep)) + return true; + + return false; +} + +static void +batch_add_dep(struct fd_batch *batch, struct fd_batch *dep) { - rsc->status |= status; + if (batch->dependents_mask & (1 << dep->idx)) + return; + /* if the new dependency already depends on us, we need to flush + * to avoid a loop in the dependency graph. + */ + if (batch_depends_on(dep, batch)) { + DBG("%p: flush forced on %p!", batch, dep); + fd_batch_flush(dep); + } else { + struct fd_batch *other = NULL; + fd_batch_reference(&other, dep); + batch->dependents_mask |= (1 << dep->idx); + DBG("%p: added dependency on %p", batch, dep); + } +} + +void +fd_batch_resource_used(struct fd_batch *batch, struct fd_resource *rsc, bool write) +{ if (rsc->stencil) - rsc->stencil->status |= status; + fd_batch_resource_used(batch, rsc->stencil, write); + + DBG("%p: %s %p", batch, write ? "write" : "read", rsc); - /* TODO resources can actually be shared across contexts, - * so I'm not sure a single list-head will do the trick?
+ /* note, invalidate write batch, to avoid further writes to rsc + * resulting in a write-after-read hazard. */ - debug_assert((rsc->pending_batch == batch) || !rsc->pending_batch); - list_delinit(&rsc->list); - list_addtail(&rsc->list, &batch->used_resources); - fd_batch_reference(&rsc->pending_batch, batch); + + if (write) { + /* if we are pending read or write by any other batch: */ + if (rsc->batch_mask != (1 << batch->idx)) { + struct fd_batch_cache *cache = &batch->ctx->screen->batch_cache; + struct fd_batch *dep; + foreach_batch(dep, cache, rsc->batch_mask) { + struct fd_batch *b = NULL; + /* note that batch_add_dep could flush and unref dep, so + * we need to hold a reference to keep it live for the + * fd_bc_invalidate_batch() + */ + fd_batch_reference(&b, dep); + batch_add_dep(batch, b); + fd_bc_invalidate_batch(b, false); + fd_batch_reference_locked(&b, NULL); + } + } + fd_batch_reference(&rsc->write_batch, batch); + } else { + if (rsc->write_batch) { + batch_add_dep(batch, rsc->write_batch); + fd_bc_invalidate_batch(rsc->write_batch, false); + } + } + + if (rsc->batch_mask & (1 << batch->idx)) + return; + + debug_assert(!_mesa_set_search(batch->resources, rsc)); + + _mesa_set_add(batch->resources, rsc); + rsc->batch_mask |= (1 << batch->idx); } void @@ -149,5 +314,5 @@ fd_batch_check_size(struct fd_batch *batch) struct fd_ringbuffer *ring = batch->draw; if (((ring->cur - ring->start) > (ring->size/4 - 0x1000)) || (fd_mesa_debug & FD_DBG_FLUSH)) - fd_context_render(&batch->ctx->base); + fd_batch_flush(batch); } diff --git a/src/gallium/drivers/freedreno/freedreno_batch.h b/src/gallium/drivers/freedreno/freedreno_batch.h index 4607250d3af..89d1d9fea7b 100644 --- a/src/gallium/drivers/freedreno/freedreno_batch.h +++ b/src/gallium/drivers/freedreno/freedreno_batch.h @@ -42,6 +42,7 @@ enum fd_resource_status; struct fd_batch { struct pipe_reference reference; unsigned seqno; + unsigned idx; struct fd_context *ctx; @@ -117,15 +118,24 @@ struct fd_batch { /** 
tiling/gmem (IB0) cmdstream: */ struct fd_ringbuffer *gmem; - /** list of resources used by currently-unsubmitted batch */ - struct list_head used_resources; + /* Set of resources used by currently-unsubmitted batch (read or + * write).. does not hold a reference to the resource. + */ + struct set *resources; + + /** key in batch-cache (if not null): */ + const void *key; + uint32_t hash; + + /** set of dependent batches.. holds refs to dependent batches: */ + uint32_t dependents_mask; }; struct fd_batch * fd_batch_create(struct fd_context *ctx); +void fd_batch_reset(struct fd_batch *batch); void fd_batch_flush(struct fd_batch *batch); -void fd_batch_resource_used(struct fd_batch *batch, struct fd_resource *rsc, - enum fd_resource_status status); +void fd_batch_resource_used(struct fd_batch *batch, struct fd_resource *rsc, bool write); void fd_batch_check_size(struct fd_batch *batch); /* not called directly: */ diff --git a/src/gallium/drivers/freedreno/freedreno_batch_cache.c b/src/gallium/drivers/freedreno/freedreno_batch_cache.c new file mode 100644 index 00000000000..c947a559df9 --- /dev/null +++ b/src/gallium/drivers/freedreno/freedreno_batch_cache.c @@ -0,0 +1,354 @@ +/* + * Copyright (C) 2016 Rob Clark + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark + */ + +#include "util/hash_table.h" +#include "util/set.h" +#include "util/list.h" +#include "util/u_string.h" + +#include "freedreno_batch.h" +#include "freedreno_batch_cache.h" +#include "freedreno_context.h" +#include "freedreno_resource.h" + +/* Overview: + * + * The batch cache provides lookup for mapping pipe_framebuffer_state + * to a batch. + * + * It does this via hashtable, with a key that roughly matches the + * pipe_framebuffer_state, as described below. + * + * Batch Cache hashtable key: + * + * To serialize the key, and to avoid dealing with holding a reference to + * pipe_surfaces (which hold a reference to pipe_resource and complicate + * the whole refcnting thing), the key is variable length and inlines the + * pertinent details of the pipe_surface. + * + * Batch: + * + * Each batch needs to hold a reference to each resource it depends on (ie. + * anything that needs a mem2gmem). And a weak reference to resources it + * renders to. (If both src[n] and dst[n] are not NULL then they are the + * same.) + * + * When a resource is destroyed, we need to remove entries in the batch + * cache that reference the resource, to avoid dangling pointer issues. + * So each resource holds a hashset of batches which reference it + * in their hashtable key. + * + * When a batch has weak reference to no more resources (ie. all the + * surfaces it rendered to are destroyed) the batch can be destroyed.
+ * Could happen in an app that renders and never uses the result. More + * common scenario, I think, will be that some, but not all, of the + * surfaces are destroyed before the batch is submitted. + * + * If (for example), batch writes to zsbuf but that surface is destroyed + * before batch is submitted, we can skip gmem2mem (but still need to + * alloc gmem space as before). If the batch depended on previous contents + * of that surface, it would be holding a reference so the surface would + * not have been destroyed. + */ + +struct key { + uint32_t width, height, layers; + uint16_t samples, num_surfs; + struct fd_context *ctx; + struct { + struct pipe_resource *texture; + union pipe_surface_desc u; + uint16_t pos, format; + } surf[0]; +}; + +static struct key * +key_alloc(unsigned num_surfs) +{ + struct key *key = + CALLOC_VARIANT_LENGTH_STRUCT(key, sizeof(key->surf[0]) * num_surfs); + return key; +} + +static uint32_t +key_hash(const void *_key) +{ + const struct key *key = _key; + uint32_t hash = _mesa_fnv32_1a_offset_bias; + hash = _mesa_fnv32_1a_accumulate_block(hash, key, offsetof(struct key, surf[0])); + hash = _mesa_fnv32_1a_accumulate_block(hash, key->surf, sizeof(key->surf[0]) * key->num_surfs); + return hash; +} + +static bool +key_equals(const void *_a, const void *_b) +{ + const struct key *a = _a; + const struct key *b = _b; + return (memcmp(a, b, offsetof(struct key, surf[0])) == 0) && + (memcmp(a->surf, b->surf, sizeof(a->surf[0]) * a->num_surfs) == 0); +} + +void +fd_bc_init(struct fd_batch_cache *cache) +{ + cache->ht = _mesa_hash_table_create(NULL, key_hash, key_equals); +} + +void +fd_bc_fini(struct fd_batch_cache *cache) +{ + _mesa_hash_table_destroy(cache->ht, NULL); +} + +uint32_t +fd_bc_flush(struct fd_batch_cache *cache, struct fd_context *ctx) +{ + struct hash_entry *entry; + uint32_t timestamp = 0; + + hash_table_foreach(cache->ht, entry) { + struct fd_batch *batch = NULL; + fd_batch_reference(&batch, (struct fd_batch *)entry->data); +
if (batch->ctx == ctx) { + fd_batch_flush(batch); + timestamp = MAX2(timestamp, fd_ringbuffer_timestamp(batch->gmem)); + } + fd_batch_reference(&batch, NULL); + } + + return timestamp; +} + +void +fd_bc_invalidate_context(struct fd_context *ctx) +{ + struct fd_batch_cache *cache = &ctx->screen->batch_cache; + struct fd_batch *batch; + + foreach_batch(batch, cache, cache->batch_mask) { + if (batch->ctx == ctx) { + fd_batch_reset(batch); + fd_batch_reference(&batch, NULL); + } + } +} + +void +fd_bc_invalidate_batch(struct fd_batch *batch, bool destroy) +{ + struct fd_batch_cache *cache = &batch->ctx->screen->batch_cache; + struct key *key = (struct key *)batch->key; + + if (destroy) { + cache->batches[batch->idx] = NULL; + cache->batch_mask &= ~(1 << batch->idx); + } + + if (!key) + return; + + DBG("%p: key=%p", batch, batch->key); + for (unsigned idx = 0; idx < key->num_surfs; idx++) { + struct fd_resource *rsc = fd_resource(key->surf[idx].texture); + rsc->bc_batch_mask &= ~(1 << batch->idx); + } + + struct hash_entry *entry = + _mesa_hash_table_search_pre_hashed(cache->ht, batch->hash, key); + _mesa_hash_table_remove(cache->ht, entry); + + batch->key = NULL; + free(key); +} + +void +fd_bc_invalidate_resource(struct fd_resource *rsc, bool destroy) +{ + struct fd_screen *screen = fd_screen(rsc->base.b.screen); + struct fd_batch *batch; + + if (destroy) { + foreach_batch(batch, &screen->batch_cache, rsc->batch_mask) { + struct set_entry *entry = _mesa_set_search(batch->resources, rsc); + _mesa_set_remove(batch->resources, entry); + } + rsc->batch_mask = 0; + + fd_batch_reference(&rsc->write_batch, NULL); + } + + foreach_batch(batch, &screen->batch_cache, rsc->bc_batch_mask) + fd_bc_invalidate_batch(batch, false); + + rsc->bc_batch_mask = 0; +} + +struct fd_batch * +fd_bc_alloc_batch(struct fd_batch_cache *cache, struct fd_context *ctx) +{ + struct fd_batch *batch; + uint32_t idx; + + while ((idx = ffs(~cache->batch_mask)) == 0) { +#if 0 + for (unsigned i = 0; i < 
ARRAY_SIZE(cache->batches); i++) { + batch = cache->batches[i]; + debug_printf("%d: needs_flush=%d, depends:", batch->idx, batch->needs_flush); + struct set_entry *entry; + set_foreach(batch->dependencies, entry) { + struct fd_batch *dep = (struct fd_batch *)entry->key; + debug_printf(" %d", dep->idx); + } + debug_printf("\n"); + } +#endif + /* TODO: is LRU the better policy? Or perhaps the batch that + * depends on the fewest other batches? + */ + struct fd_batch *flush_batch = NULL; + for (unsigned i = 0; i < ARRAY_SIZE(cache->batches); i++) { + if ((cache->batches[i] == ctx->batch) || + !cache->batches[i]->needs_flush) + continue; + if (!flush_batch || (cache->batches[i]->seqno < flush_batch->seqno)) + fd_batch_reference(&flush_batch, cache->batches[i]); + } + DBG("%p: too many batches! flush forced!", flush_batch); + fd_batch_flush(flush_batch); + + /* While the resources get cleaned up automatically, the flush_batch + * doesn't get removed from the dependencies of other batches, so + * it won't be unref'd and will remain in the table. 
+ * + * TODO maybe keep a bitmask of batches that depend on me, to make + * this easier: + */ + for (unsigned i = 0; i < ARRAY_SIZE(cache->batches); i++) { + struct fd_batch *other = cache->batches[i]; + if (!other) + continue; + if (other->dependents_mask & (1 << flush_batch->idx)) { + other->dependents_mask &= ~(1 << flush_batch->idx); + struct fd_batch *ref = flush_batch; + fd_batch_reference(&ref, NULL); + } + } + + fd_batch_reference(&flush_batch, NULL); + } + + idx--; /* bit zero returns 1 for ffs() */ + + batch = fd_batch_create(ctx); + if (!batch) + return NULL; + + batch->seqno = cache->cnt++; + batch->idx = idx; + cache->batch_mask |= (1 << idx); + + debug_assert(cache->batches[idx] == NULL); + cache->batches[idx] = batch; + + return batch; +} + +static struct fd_batch * +batch_from_key(struct fd_batch_cache *cache, struct key *key, + struct fd_context *ctx) +{ + struct fd_batch *batch = NULL; + uint32_t hash = key_hash(key); + struct hash_entry *entry = + _mesa_hash_table_search_pre_hashed(cache->ht, hash, key); + + if (entry) { + free(key); + fd_batch_reference(&batch, (struct fd_batch *)entry->data); + return batch; + } + + batch = fd_bc_alloc_batch(cache, ctx); +#ifdef DEBUG + DBG("%p: hash=0x%08x, %ux%u, %u layers, %u samples", batch, hash, + key->width, key->height, key->layers, key->samples); + for (unsigned idx = 0; idx < key->num_surfs; idx++) { + DBG("%p: surf[%u]: %p (%s) (%u,%u / %u,%u,%u)", batch, key->surf[idx].pos, + key->surf[idx].texture, util_format_name(key->surf[idx].format), + key->surf[idx].u.buf.first_element, key->surf[idx].u.buf.last_element, + key->surf[idx].u.tex.first_layer, key->surf[idx].u.tex.last_layer, + key->surf[idx].u.tex.level); + } +#endif + if (!batch) + return NULL; + + _mesa_hash_table_insert_pre_hashed(cache->ht, hash, key, batch); + batch->key = key; + batch->hash = hash; + + for (unsigned idx = 0; idx < key->num_surfs; idx++) { + struct fd_resource *rsc = fd_resource(key->surf[idx].texture); + rsc->bc_batch_mask 
= (1 << batch->idx); + } + + return batch; +} + +static void +key_surf(struct key *key, unsigned idx, unsigned pos, struct pipe_surface *psurf) +{ + key->surf[idx].texture = psurf->texture; + key->surf[idx].u = psurf->u; + key->surf[idx].pos = pos; + key->surf[idx].format = psurf->format; +} + +struct fd_batch * +fd_batch_from_fb(struct fd_batch_cache *cache, struct fd_context *ctx, + const struct pipe_framebuffer_state *pfb) +{ + unsigned idx = 0, n = pfb->nr_cbufs + (pfb->zsbuf ? 1 : 0); + struct key *key = key_alloc(n); + + key->width = pfb->width; + key->height = pfb->height; + key->layers = pfb->layers; + key->samples = pfb->samples; + key->ctx = ctx; + + if (pfb->zsbuf) + key_surf(key, idx++, 0, pfb->zsbuf); + + for (unsigned i = 0; i < pfb->nr_cbufs; i++) + if (pfb->cbufs[i]) + key_surf(key, idx++, i + 1, pfb->cbufs[i]); + + key->num_surfs = idx; + + return batch_from_key(cache, key, ctx); +} diff --git a/src/gallium/drivers/freedreno/freedreno_batch_cache.h b/src/gallium/drivers/freedreno/freedreno_batch_cache.h new file mode 100644 index 00000000000..90500d50121 --- /dev/null +++ b/src/gallium/drivers/freedreno/freedreno_batch_cache.h @@ -0,0 +1,73 @@ +/* + * Copyright (C) 2016 Rob Clark + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: + * Rob Clark + */ + +#ifndef FREEDRENO_BATCH_CACHE_H_ +#define FREEDRENO_BATCH_CACHE_H_ + +#include "pipe/p_state.h" + +#include "freedreno_batch.h" + +struct hash_table; + +struct fd_batch_cache { + struct hash_table *ht; + unsigned cnt; + + /* set of active batches.. there is an upper limit on the number of + * in-flight batches, for two reasons: + * 1) to avoid big spikes in number of batches in edge cases, such as + * game startup (ie, lots of texture uploads, but no usages yet of + * the textures), etc. + * 2) so we can use a simple bitmask in fd_resource to track which + * batches have reference to the resource + */ + struct fd_batch *batches[32]; + uint32_t batch_mask; +}; + +/* note: if batches get unref'd in the body of the loop, they are removed + * from the various masks.. 
but since we copy the mask at the beginning of + * the loop into _m, we need the &= at the end of the loop to make sure + * we don't have stale bits in _m + */ +#define foreach_batch(batch, cache, mask) \ + for (uint32_t _m = (mask); _m && ((batch) = (cache)->batches[u_bit_scan(&_m)]); _m &= (mask)) + +void fd_bc_init(struct fd_batch_cache *cache); +void fd_bc_fini(struct fd_batch_cache *cache); + +uint32_t fd_bc_flush(struct fd_batch_cache *cache, struct fd_context *ctx); + +void fd_bc_invalidate_context(struct fd_context *ctx); +void fd_bc_invalidate_batch(struct fd_batch *batch, bool destroy); +void fd_bc_invalidate_resource(struct fd_resource *rsc, bool destroy); +struct fd_batch * fd_bc_alloc_batch(struct fd_batch_cache *cache, struct fd_context *ctx); + +struct fd_batch * fd_batch_from_fb(struct fd_batch_cache *cache, + struct fd_context *ctx, const struct pipe_framebuffer_state *pfb); + +#endif /* FREEDRENO_BATCH_CACHE_H_ */ diff --git a/src/gallium/drivers/freedreno/freedreno_context.c b/src/gallium/drivers/freedreno/freedreno_context.c index b9a1fe97745..e81d31175f0 100644 --- a/src/gallium/drivers/freedreno/freedreno_context.c +++ b/src/gallium/drivers/freedreno/freedreno_context.c @@ -38,39 +38,27 @@ #include "freedreno_query_hw.h" #include "freedreno_util.h" -/* emit accumulated render cmds, needed for example if render target has - * changed, or for flush() - */ -void -fd_context_render(struct pipe_context *pctx) -{ - struct fd_context *ctx = fd_context(pctx); - struct fd_batch *new_batch; - - fd_batch_flush(ctx->batch); - - new_batch = fd_batch_create(ctx); - util_copy_framebuffer_state(&new_batch->framebuffer, &ctx->batch->framebuffer); - fd_batch_reference(&ctx->batch, NULL); - ctx->batch = new_batch; -} - static void fd_context_flush(struct pipe_context *pctx, struct pipe_fence_handle **fence, unsigned flags) { - struct fd_batch *batch = NULL; - - fd_batch_reference(&batch, fd_context(pctx)->batch); - - fd_context_render(pctx); + struct fd_context 
*ctx = fd_context(pctx); + uint32_t timestamp; + + if (!ctx->screen->reorder) { + struct fd_batch *batch = NULL; + fd_batch_reference(&batch, ctx->batch); + fd_batch_flush(batch); + timestamp = fd_ringbuffer_timestamp(batch->gmem); + fd_batch_reference(&batch, NULL); + } else { + timestamp = fd_bc_flush(&ctx->screen->batch_cache, ctx); + } if (fence) { fd_screen_fence_ref(pctx->screen, fence, NULL); - *fence = fd_fence_create(pctx, fd_ringbuffer_timestamp(batch->gmem)); + *fence = fd_fence_create(pctx, timestamp); } - - fd_batch_reference(&batch, NULL); } /** @@ -81,9 +69,14 @@ static void fd_emit_string_marker(struct pipe_context *pctx, const char *string, int len) { struct fd_context *ctx = fd_context(pctx); - struct fd_ringbuffer *ring = ctx->batch->draw; + struct fd_ringbuffer *ring; const uint32_t *buf = (const void *)string; + if (!ctx->batch) + return; + + ring = ctx->batch->draw; + /* max packet size is 0x3fff dwords: */ len = MIN2(len, 0x3fff * 4); @@ -110,6 +103,9 @@ fd_context_destroy(struct pipe_context *pctx) DBG(""); + fd_batch_reference(&ctx->batch, NULL); /* unref current batch */ + fd_bc_invalidate_context(ctx); + fd_prog_fini(pctx); fd_hw_query_fini(pctx); @@ -121,8 +117,6 @@ fd_context_destroy(struct pipe_context *pctx) util_slab_destroy(&ctx->transfer_pool); - fd_batch_reference(&ctx->batch, NULL); /* unref current batch */ - for (i = 0; i < ARRAY_SIZE(ctx->pipe); i++) { struct fd_vsc_pipe *pipe = &ctx->pipe[i]; if (!pipe->bo) @@ -177,7 +171,12 @@ fd_context_init(struct fd_context *ctx, struct pipe_screen *pscreen, pctx->emit_string_marker = fd_emit_string_marker; pctx->set_debug_callback = fd_set_debug_callback; - ctx->batch = fd_batch_create(ctx); + /* TODO what about compute? Ideally it creates its own independent + * batches per compute job (since it isn't using tiling, so no point + * in getting involved with the re-ordering madness)..
+ */ + if (!screen->reorder) + ctx->batch = fd_bc_alloc_batch(&screen->batch_cache, ctx); fd_reset_wfi(ctx); diff --git a/src/gallium/drivers/freedreno/freedreno_context.h b/src/gallium/drivers/freedreno/freedreno_context.h index 9401367dca5..f25ec34758f 100644 --- a/src/gallium/drivers/freedreno/freedreno_context.h +++ b/src/gallium/drivers/freedreno/freedreno_context.h @@ -381,8 +381,6 @@ struct pipe_context * fd_context_init(struct fd_context *ctx, struct pipe_screen *pscreen, const uint8_t *primtypes, void *priv); -void fd_context_render(struct pipe_context *pctx); - void fd_context_destroy(struct pipe_context *pctx); #endif /* FREEDRENO_CONTEXT_H_ */ diff --git a/src/gallium/drivers/freedreno/freedreno_draw.c b/src/gallium/drivers/freedreno/freedreno_draw.c index b9477620176..f067715e535 100644 --- a/src/gallium/drivers/freedreno/freedreno_draw.c +++ b/src/gallium/drivers/freedreno/freedreno_draw.c @@ -44,7 +44,7 @@ resource_read(struct fd_batch *batch, struct pipe_resource *prsc) { if (!prsc) return; - fd_batch_resource_used(batch, fd_resource(prsc), FD_PENDING_READ); + fd_batch_resource_used(batch, fd_resource(prsc), false); } static void @@ -52,7 +52,7 @@ resource_written(struct fd_batch *batch, struct pipe_resource *prsc) { if (!prsc) return; - fd_batch_resource_used(batch, fd_resource(prsc), FD_PENDING_WRITE); + fd_batch_resource_used(batch, fd_resource(prsc), true); } static void diff --git a/src/gallium/drivers/freedreno/freedreno_query_hw.c b/src/gallium/drivers/freedreno/freedreno_query_hw.c index 76d90d60410..4c450c62dc4 100644 --- a/src/gallium/drivers/freedreno/freedreno_query_hw.c +++ b/src/gallium/drivers/freedreno/freedreno_query_hw.c @@ -216,7 +216,7 @@ fd_hw_get_query_result(struct fd_context *ctx, struct fd_query *q, if (!ctx->batch->needs_flush) return true; DBG("reading query result forces flush!"); - fd_context_render(&ctx->base); + fd_batch_flush(ctx->batch); } util_query_clear_result(result, q->type); diff --git 
a/src/gallium/drivers/freedreno/freedreno_resource.c b/src/gallium/drivers/freedreno/freedreno_resource.c index 20d68fed9d5..ae8061cba28 100644 --- a/src/gallium/drivers/freedreno/freedreno_resource.c +++ b/src/gallium/drivers/freedreno/freedreno_resource.c @@ -33,8 +33,10 @@ #include "util/u_transfer.h" #include "util/u_string.h" #include "util/u_surface.h" +#include "util/set.h" #include "freedreno_resource.h" +#include "freedreno_batch_cache.h" #include "freedreno_screen.h" #include "freedreno_surface.h" #include "freedreno_context.h" @@ -47,10 +49,20 @@ #include "state_tracker/drm_driver.h" static bool -pending(struct fd_resource *rsc, enum fd_resource_status status) +pending(struct fd_resource *rsc, bool write) { - return (rsc->status & status) || - (rsc->stencil && (rsc->stencil->status & status)); + /* if we have a pending GPU write, we are busy in any case: */ + if (rsc->write_batch) + return true; + + /* if CPU wants to write, but we are pending a GPU read, we are busy: */ + if (write && rsc->batch_mask) + return true; + + if (rsc->stencil && pending(rsc->stencil, write)) + return true; + + return false; } static void @@ -108,10 +120,8 @@ realloc_bo(struct fd_resource *rsc, uint32_t size) rsc->bo = fd_bo_new(screen->dev, size, flags); rsc->timestamp = 0; - rsc->status = 0; - fd_batch_reference(&rsc->pending_batch, NULL); - list_delinit(&rsc->list); util_range_set_empty(&rsc->valid_buffer_range); + fd_bc_invalidate_resource(rsc, true); } static unsigned @@ -324,9 +334,18 @@ fd_resource_transfer_map(struct pipe_context *pctx, * resource and we're trying to write to it, flush the renders. 
*/ if (((ptrans->usage & PIPE_TRANSFER_WRITE) && - pending(rsc, FD_PENDING_READ | FD_PENDING_WRITE)) || - pending(rsc, FD_PENDING_WRITE)) - fd_context_render(pctx); + pending(rsc, true)) || + pending(rsc, false)) { + if (usage & PIPE_TRANSFER_WRITE) { + struct fd_batch *batch; + foreach_batch(batch, &ctx->screen->batch_cache, rsc->batch_mask) { + fd_batch_flush(batch); + } + assert(rsc->batch_mask == 0); + } else { + fd_batch_flush(rsc->write_batch); + } + } /* The GPU keeps track of how the various bo's are being used, and * will wait if necessary for the proper operation to have @@ -451,10 +470,9 @@ fd_resource_destroy(struct pipe_screen *pscreen, struct pipe_resource *prsc) { struct fd_resource *rsc = fd_resource(prsc); + fd_bc_invalidate_resource(rsc, true); if (rsc->bo) fd_bo_del(rsc->bo); - fd_batch_reference(&rsc->pending_batch, NULL); - list_delinit(&rsc->list); util_range_destroy(&rsc->valid_buffer_range); FREE(rsc); } @@ -570,7 +588,7 @@ fd_resource_create(struct pipe_screen *pscreen, *prsc = *tmpl; pipe_reference_init(&prsc->reference, 1); - list_inithead(&rsc->list); + prsc->screen = pscreen; util_range_init(&rsc->valid_buffer_range); @@ -657,7 +675,7 @@ fd_resource_from_handle(struct pipe_screen *pscreen, *prsc = *tmpl; pipe_reference_init(&prsc->reference, 1); - list_inithead(&rsc->list); + prsc->screen = pscreen; util_range_init(&rsc->valid_buffer_range); @@ -846,8 +864,10 @@ fd_flush_resource(struct pipe_context *pctx, struct pipe_resource *prsc) { struct fd_resource *rsc = fd_resource(prsc); - if (pending(rsc, FD_PENDING_WRITE | FD_PENDING_READ)) - fd_context_render(pctx); + if (rsc->write_batch) + fd_batch_flush(rsc->write_batch); + + assert(!rsc->write_batch); } void diff --git a/src/gallium/drivers/freedreno/freedreno_resource.h b/src/gallium/drivers/freedreno/freedreno_resource.h index f8131c774ec..fcdb4c1e364 100644 --- a/src/gallium/drivers/freedreno/freedreno_resource.h +++ b/src/gallium/drivers/freedreno/freedreno_resource.h @@ -61,14 +61,7 
@@ struct fd_resource_slice { uint32_t size0; /* size of first layer in slice */ }; -/* status of queued up but not flushed reads and write operations. - * In _transfer_map() we need to know if queued up rendering needs - * to be flushed to preserve the order of cpu and gpu access. - */ -enum fd_resource_status { - FD_PENDING_WRITE = 0x01, - FD_PENDING_READ = 0x02, -}; +struct set; struct fd_resource { struct u_resource base; @@ -86,13 +79,23 @@ struct fd_resource { /* TODO rename to secondary or auxiliary? */ struct fd_resource *stencil; - /* pending read/write state: */ - enum fd_resource_status status; - /* resources accessed by queued but not flushed draws are tracked - * in the used_resources list. + /* bitmask of in-flight batches which reference this resource. Note + * that the batch doesn't hold reference to resources (but instead + * the fd_ringbuffer holds refs to the underlying fd_bo), but in case + * the resource is destroyed we need to clean up the batch's weak + * references to us. + */ + uint32_t batch_mask; + + /* reference to batch that writes this resource: */ + struct fd_batch *write_batch; + + /* Set of batches whose batch-cache key references this resource. + * We need to track this to know which batch-cache entries to + * invalidate if, for example, the resource is invalidated or + * shadowed. 
*/ - struct list_head list; - struct fd_batch *pending_batch; + uint32_t bc_batch_mask; }; static inline struct fd_resource * diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c index 222f2881f01..31f2cb2147a 100644 --- a/src/gallium/drivers/freedreno/freedreno_screen.c +++ b/src/gallium/drivers/freedreno/freedreno_screen.c @@ -75,6 +75,7 @@ static const struct debug_named_value debug_options[] = { {"flush", FD_DBG_FLUSH, "Force flush after every draw"}, {"deqp", FD_DBG_DEQP, "Enable dEQP hacks"}, {"nir", FD_DBG_NIR, "Prefer NIR as native IR"}, + {"reorder", FD_DBG_REORDER,"Enable reordering for draws/blits"}, DEBUG_NAMED_VALUE_END }; @@ -134,6 +135,8 @@ fd_screen_destroy(struct pipe_screen *pscreen) if (screen->dev) fd_device_del(screen->dev); + fd_bc_fini(&screen->batch_cache); + free(screen); } @@ -662,6 +665,16 @@ fd_screen_create(struct fd_device *dev) goto fail; } + /* NOTE: don't enable reordering on a2xx, since completely untested. + * Also, don't enable if we have too old of a kernel to support + * growable cmdstream buffers, since memory requirement for cmdstream + * buffers would be too much otherwise. 
+ */ + if ((screen->gpu_id >= 300) && (fd_device_version(dev) >= FD_VERSION_UNLIMITED_CMDS)) + screen->reorder = !!(fd_mesa_debug & FD_DBG_REORDER); + + fd_bc_init(&screen->batch_cache); + pscreen->destroy = fd_screen_destroy; pscreen->get_param = fd_screen_get_param; pscreen->get_paramf = fd_screen_get_paramf; diff --git a/src/gallium/drivers/freedreno/freedreno_screen.h b/src/gallium/drivers/freedreno/freedreno_screen.h index 0c899d5a7f0..38d38f2f1ab 100644 --- a/src/gallium/drivers/freedreno/freedreno_screen.h +++ b/src/gallium/drivers/freedreno/freedreno_screen.h @@ -35,7 +35,7 @@ #include "pipe/p_screen.h" #include "util/u_memory.h" -typedef uint32_t u32; +#include "freedreno_batch_cache.h" struct fd_bo; @@ -66,6 +66,10 @@ struct fd_screen { struct fd_pipe *pipe; int64_t cpu_gpu_time_delta; + + struct fd_batch_cache batch_cache; + + bool reorder; }; static inline struct fd_screen * diff --git a/src/gallium/drivers/freedreno/freedreno_state.c b/src/gallium/drivers/freedreno/freedreno_state.c index 98b56c7d512..8ac41d29077 100644 --- a/src/gallium/drivers/freedreno/freedreno_state.c +++ b/src/gallium/drivers/freedreno/freedreno_state.c @@ -117,10 +117,17 @@ fd_set_framebuffer_state(struct pipe_context *pctx, struct fd_context *ctx = fd_context(pctx); struct pipe_framebuffer_state *cso; - DBG("%d: cbufs[0]=%p, zsbuf=%p", ctx->batch->needs_flush, - framebuffer->cbufs[0], framebuffer->zsbuf); - - fd_context_render(pctx); + if (ctx->screen->reorder) { + struct fd_batch *batch = + fd_batch_from_fb(&ctx->screen->batch_cache, ctx, framebuffer); + fd_batch_reference(&ctx->batch, NULL); + ctx->batch = batch; + ctx->dirty = ~0; + } else { + DBG("%d: cbufs[0]=%p, zsbuf=%p", ctx->batch->needs_flush, + framebuffer->cbufs[0], framebuffer->zsbuf); + fd_batch_flush(ctx->batch); + } cso = &ctx->batch->framebuffer; diff --git a/src/gallium/drivers/freedreno/freedreno_util.h b/src/gallium/drivers/freedreno/freedreno_util.h index 8f125d95554..5cb958e65ab 100644 --- 
a/src/gallium/drivers/freedreno/freedreno_util.h +++ b/src/gallium/drivers/freedreno/freedreno_util.h @@ -75,6 +75,7 @@ enum adreno_stencil_op fd_stencil_op(unsigned op); #define FD_DBG_FLUSH 0x1000 #define FD_DBG_DEQP 0x2000 #define FD_DBG_NIR 0x4000 +#define FD_DBG_REORDER 0x8000 extern int fd_mesa_debug; extern bool fd_binning_enabled;