/** @file brw_state_cache.c
*
- * This file implements a simple static state cache for 965. The consumers
- * can query the hash table of state using a cache_id, opaque key data,
- * and list of buffers that will be used in relocations, and receive the
- * corresponding state buffer object of state (plus associated auxiliary
- * data) in return.
+ * This file implements a simple static state cache for 965. The
+ * consumers can query the hash table of state using a cache_id,
+ * opaque key data, and receive the corresponding state buffer object
+ * of state (plus associated auxiliary data) in return. Objects in
+ * the cache may not have relocations (pointers to other BOs) in them.
*
- * The inner workings are a simple hash table based on a CRC of the key data.
- * The cache_id and relocation target buffers associated with the state
- * buffer are included as auxiliary key data, but are not part of the hash
- * value (this should be fixed, but will likely be fixed instead by making
- * consumers use structured keys).
+ * The inner workings are a simple hash table based on a CRC of the
+ * key data.
*
- * Replacement is not implemented. Instead, when the cache gets too big, at
- * a safe point (unlock) we throw out all of the cache data and let it
- * regenerate for the next rendering operation.
- *
- * The reloc_buf pointers need to be included as key data, otherwise the
- * non-unique values stuffed in the offset in key data through
- * brw_cache_data() may result in successful probe for state buffers
- * even when the buffer being referenced doesn't match. The result would be
- * that the same state cache entry is used twice for different buffers,
- * only one of the two buffers referenced gets put into the offset, and the
- * incorrect program is run for the other instance.
+ * Replacement is not implemented. Instead, when the cache gets too
+ * big we throw out all of the cache data and let it get regenerated.
*/
#include "main/imports.h"
-#include "brw_state.h"
#include "intel_batchbuffer.h"
+#include "brw_state.h"
+#include "brw_vs.h"
#include "brw_wm.h"
#define FILE_DEBUG_FLAG DEBUG_STATE
hash = (hash << 5) | (hash >> 27);
}
- /* Include the BO pointers as key data as well */
- ikey = (GLuint *)item->reloc_bufs;
- for (i = 0; i < item->nr_reloc_bufs * sizeof(drm_intel_bo *) / 4; i++) {
- hash ^= ikey[i];
- hash = (hash << 5) | (hash >> 27);
- }
-
return hash;
}
-
-/**
- * Marks a new buffer as being chosen for the given cache id.
- */
-static void
-update_cache_last(struct brw_cache *cache, enum brw_cache_id cache_id,
- drm_intel_bo *bo)
-{
- if (bo == cache->last_bo[cache_id])
- return; /* no change */
-
- drm_intel_bo_unreference(cache->last_bo[cache_id]);
- cache->last_bo[cache_id] = bo;
- drm_intel_bo_reference(cache->last_bo[cache_id]);
- cache->brw->state.dirty.cache |= 1 << cache_id;
-}
-
static int
brw_cache_item_equals(const struct brw_cache_item *a,
const struct brw_cache_item *b)
return a->cache_id == b->cache_id &&
a->hash == b->hash &&
a->key_size == b->key_size &&
- (memcmp(a->key, b->key, a->key_size) == 0) &&
- a->nr_reloc_bufs == b->nr_reloc_bufs &&
- (memcmp(a->reloc_bufs, b->reloc_bufs,
- a->nr_reloc_bufs * sizeof(drm_intel_bo *)) == 0);
+ (memcmp(a->key, b->key, a->key_size) == 0);
}
static struct brw_cache_item *
GLuint size, i;
size = cache->size * 3;
- items = (struct brw_cache_item**) calloc(1, size * sizeof(*items));
+ items = calloc(1, size * sizeof(*items));
for (i = 0; i < cache->size; i++)
for (c = cache->items[i]; c; c = next) {
items[c->hash % size] = c;
}
- FREE(cache->items);
+ free(cache->items);
cache->items = items;
cache->size = size;
}
/**
* Returns the buffer object matching cache_id and key, or NULL.
*/
-drm_intel_bo *
+bool
brw_search_cache(struct brw_cache *cache,
enum brw_cache_id cache_id,
- const void *key,
- GLuint key_size,
- drm_intel_bo **reloc_bufs, GLuint nr_reloc_bufs,
- void *aux_return)
+ const void *key, GLuint key_size,
+ uint32_t *inout_offset, void *out_aux)
{
+ struct brw_context *brw = cache->brw;
struct brw_cache_item *item;
struct brw_cache_item lookup;
GLuint hash;
lookup.cache_id = cache_id;
lookup.key = key;
lookup.key_size = key_size;
- lookup.reloc_bufs = reloc_bufs;
- lookup.nr_reloc_bufs = nr_reloc_bufs;
hash = hash_key(&lookup);
lookup.hash = hash;
item = search_cache(cache, hash, &lookup);
if (item == NULL)
- return NULL;
+ return false;
- if (aux_return)
- *(void **)aux_return = (void *)((char *)item->key + item->key_size);
+ *(void **)out_aux = ((char *)item->key + item->key_size);
- update_cache_last(cache, cache_id, item->bo);
+ if (item->offset != *inout_offset) {
+ brw->state.dirty.cache |= (1 << cache_id);
+ *inout_offset = item->offset;
+ }
- drm_intel_bo_reference(item->bo);
- return item->bo;
+ return true;
}
-
-drm_intel_bo *
-brw_upload_cache_with_auxdata(struct brw_cache *cache,
- enum brw_cache_id cache_id,
- const void *key,
- GLuint key_size,
- drm_intel_bo **reloc_bufs,
- GLuint nr_reloc_bufs,
- const void *data,
- GLuint data_size,
- const void *aux,
- GLuint aux_size,
- void *aux_return)
+static void
+brw_cache_new_bo(struct brw_cache *cache, uint32_t new_size)
{
- struct brw_cache_item *item = CALLOC_STRUCT(brw_cache_item);
- GLuint hash;
- GLuint relocs_size = nr_reloc_bufs * sizeof(drm_intel_bo *);
- void *tmp;
- drm_intel_bo *bo;
- int i;
+ struct brw_context *brw = cache->brw;
+ struct intel_context *intel = &brw->intel;
+ drm_intel_bo *new_bo;
- item->cache_id = cache_id;
- item->key = key;
- item->key_size = key_size;
- item->reloc_bufs = reloc_bufs;
- item->nr_reloc_bufs = nr_reloc_bufs;
- hash = hash_key(item);
- item->hash = hash;
+ new_bo = drm_intel_bo_alloc(intel->bufmgr, "program cache", new_size, 64);
- /* Create the buffer object to contain the data */
- bo = drm_intel_bo_alloc(cache->brw->intel.bufmgr,
- cache->name[cache_id], data_size, 1 << 6);
+ /* Copy any existing data that needs to be saved. */
+ if (cache->next_offset != 0) {
+ drm_intel_bo_map(cache->bo, false);
+ drm_intel_bo_subdata(new_bo, 0, cache->next_offset, cache->bo->virtual);
+ drm_intel_bo_unmap(cache->bo);
+ }
+ drm_intel_bo_unreference(cache->bo);
+ cache->bo = new_bo;
+ cache->bo_used_by_gpu = false;
- /* Set up the memory containing the key, aux_data, and reloc_bufs */
- tmp = malloc(key_size + aux_size + relocs_size);
+ /* Since we have a new BO in place, we need to signal the units
+ * that depend on it (state base address on gen5+, or unit state before).
+ */
+ brw->state.dirty.brw |= BRW_NEW_PROGRAM_CACHE;
+}
- memcpy(tmp, key, key_size);
- memcpy(tmp + key_size, aux, aux_size);
- memcpy(tmp + key_size + aux_size, reloc_bufs, relocs_size);
- for (i = 0; i < nr_reloc_bufs; i++) {
- if (reloc_bufs[i] != NULL)
- drm_intel_bo_reference(reloc_bufs[i]);
- }
+/**
+ * Attempts to find an item in the cache with identical data and aux
+ * data to use
+ */
+static bool
+brw_try_upload_using_copy(struct brw_cache *cache,
+ struct brw_cache_item *result_item,
+ const void *data,
+ const void *aux)
+{
+ int i;
+ struct brw_cache_item *item;
- item->key = tmp;
- item->reloc_bufs = tmp + key_size + aux_size;
+ for (i = 0; i < cache->size; i++) {
+ for (item = cache->items[i]; item; item = item->next) {
+ const void *item_aux = item->key + item->key_size;
+ int ret;
+
+ if (item->cache_id != result_item->cache_id ||
+ item->size != result_item->size ||
+ item->aux_size != result_item->aux_size) {
+ continue;
+ }
+
+ if (cache->aux_compare[result_item->cache_id]) {
+ if (!cache->aux_compare[result_item->cache_id](item_aux, aux,
+ item->aux_size,
+ item->key))
+ continue;
+ } else if (memcmp(item_aux, aux, item->aux_size) != 0) {
+ continue;
+ }
+
+ drm_intel_bo_map(cache->bo, false);
+ ret = memcmp(cache->bo->virtual + item->offset, data, item->size);
+ drm_intel_bo_unmap(cache->bo);
+ if (ret)
+ continue;
+
+ result_item->offset = item->offset;
+
+ return true;
+ }
+ }
- item->bo = bo;
- drm_intel_bo_reference(bo);
+ return false;
+}
- if (cache->n_items > cache->size * 1.5)
- rehash(cache);
+static void
+brw_upload_item_data(struct brw_cache *cache,
+ struct brw_cache_item *item,
+ const void *data)
+{
+ /* Allocate space in the cache BO for our new program. */
+ if (cache->next_offset + item->size > cache->bo->size) {
+ uint32_t new_size = cache->bo->size * 2;
- hash %= cache->size;
- item->next = cache->items[hash];
- cache->items[hash] = item;
- cache->n_items++;
+ while (cache->next_offset + item->size > new_size)
+ new_size *= 2;
- if (aux_return) {
- *(void **)aux_return = (void *)((char *)item->key + item->key_size);
+ brw_cache_new_bo(cache, new_size);
}
- DBG("upload %s: %d bytes to cache id %d\n",
- cache->name[cache_id],
- data_size, cache_id);
-
- /* Copy data to the buffer */
- drm_intel_bo_subdata(bo, 0, data_size, data);
+ /* If we would block on writing to an in-use program BO, just
+ * recreate it.
+ */
+ if (cache->bo_used_by_gpu) {
+ brw_cache_new_bo(cache, cache->bo->size);
+ }
- update_cache_last(cache, cache_id, bo);
+ item->offset = cache->next_offset;
- return bo;
+ /* Programs are always 64-byte aligned, so set up the next one now */
+ cache->next_offset = ALIGN(item->offset + item->size, 64);
}
-drm_intel_bo *
+void
brw_upload_cache(struct brw_cache *cache,
enum brw_cache_id cache_id,
const void *key,
GLuint key_size,
- drm_intel_bo **reloc_bufs,
- GLuint nr_reloc_bufs,
const void *data,
- GLuint data_size)
-{
- return brw_upload_cache_with_auxdata(cache, cache_id,
- key, key_size,
- reloc_bufs, nr_reloc_bufs,
- data, data_size,
- NULL, 0,
- NULL);
-}
-
-/**
- * Wrapper around brw_cache_data_sz using the cache_id's canonical key size.
- *
- * If nr_reloc_bufs is nonzero, brw_search_cache()/brw_upload_cache() would be
- * better to use, as the potentially changing offsets in the data-used-as-key
- * will result in excessive cache misses.
- *
- * If aux data is involved, use search/upload instead.
-
- */
-drm_intel_bo *
-brw_cache_data(struct brw_cache *cache,
- enum brw_cache_id cache_id,
- const void *data,
- GLuint data_size)
+ GLuint data_size,
+ const void *aux,
+ GLuint aux_size,
+ uint32_t *out_offset,
+ void *out_aux)
{
- drm_intel_bo *bo;
- struct brw_cache_item *item, lookup;
+ struct brw_cache_item *item = CALLOC_STRUCT(brw_cache_item);
GLuint hash;
+ void *tmp;
- lookup.cache_id = cache_id;
- lookup.key = data;
- lookup.key_size = data_size;
- lookup.reloc_bufs = NULL;
- lookup.nr_reloc_bufs = 0;
- hash = hash_key(&lookup);
- lookup.hash = hash;
+ item->cache_id = cache_id;
+ item->size = data_size;
+ item->key = key;
+ item->key_size = key_size;
+ item->aux_size = aux_size;
+ hash = hash_key(item);
+ item->hash = hash;
- item = search_cache(cache, hash, &lookup);
- if (item) {
- update_cache_last(cache, cache_id, item->bo);
- drm_intel_bo_reference(item->bo);
- return item->bo;
+ /* If we can find a matching prog/prog_data combo in the cache
+ * already, then reuse the existing stuff. This will mean not
+ * flagging CACHE_NEW_* when transitioning between the two
+ * equivalent hash keys. This is notably useful for programs
+ * generating shaders at runtime, where multiple shaders may
+ * compile to the same thing in our backend.
+ */
+ if (!brw_try_upload_using_copy(cache, item, data, aux)) {
+ brw_upload_item_data(cache, item, data);
}
- bo = brw_upload_cache(cache, cache_id,
- data, data_size,
- NULL, 0,
- data, data_size);
+ /* Set up the memory containing the key and aux_data */
+ tmp = malloc(key_size + aux_size);
- return bo;
-}
+ memcpy(tmp, key, key_size);
+ memcpy(tmp + key_size, aux, aux_size);
-enum pool_type {
- DW_SURFACE_STATE,
- DW_GENERAL_STATE
-};
+ item->key = tmp;
+ if (cache->n_items > cache->size * 1.5)
+ rehash(cache);
-static void
-brw_init_cache_id(struct brw_cache *cache,
- const char *name,
- enum brw_cache_id id)
-{
- cache->name[id] = strdup(name);
-}
+ hash %= cache->size;
+ item->next = cache->items[hash];
+ cache->items[hash] = item;
+ cache->n_items++;
+ /* Copy data to the buffer */
+ drm_intel_bo_subdata(cache->bo, item->offset, data_size, data);
-static void
-brw_init_non_surface_cache(struct brw_context *brw)
+ *out_offset = item->offset;
+ *(void **)out_aux = (void *)((char *)item->key + item->key_size);
+ cache->brw->state.dirty.cache |= 1 << cache_id;
+}
+
+void
+brw_init_caches(struct brw_context *brw)
{
+ struct intel_context *intel = &brw->intel;
struct brw_cache *cache = &brw->cache;
cache->brw = brw;
cache->size = 7;
cache->n_items = 0;
- cache->items = (struct brw_cache_item **)
- calloc(1, cache->size * sizeof(struct brw_cache_item));
-
- brw_init_cache_id(cache, "CC_VP", BRW_CC_VP);
- brw_init_cache_id(cache, "CC_UNIT", BRW_CC_UNIT);
- brw_init_cache_id(cache, "WM_PROG", BRW_WM_PROG);
- brw_init_cache_id(cache, "SAMPLER_DEFAULT_COLOR", BRW_SAMPLER_DEFAULT_COLOR);
- brw_init_cache_id(cache, "SAMPLER", BRW_SAMPLER);
- brw_init_cache_id(cache, "WM_UNIT", BRW_WM_UNIT);
- brw_init_cache_id(cache, "SF_PROG", BRW_SF_PROG);
- brw_init_cache_id(cache, "SF_VP", BRW_SF_VP);
-
- brw_init_cache_id(cache, "SF_UNIT", BRW_SF_UNIT);
-
- brw_init_cache_id(cache, "VS_UNIT", BRW_VS_UNIT);
-
- brw_init_cache_id(cache, "VS_PROG", BRW_VS_PROG);
-
- brw_init_cache_id(cache, "CLIP_UNIT", BRW_CLIP_UNIT);
-
- brw_init_cache_id(cache, "CLIP_PROG", BRW_CLIP_PROG);
- brw_init_cache_id(cache, "CLIP_VP", BRW_CLIP_VP);
+ cache->items =
+ calloc(1, cache->size * sizeof(struct brw_cache_item *));
- brw_init_cache_id(cache, "GS_UNIT", BRW_GS_UNIT);
+ cache->bo = drm_intel_bo_alloc(intel->bufmgr,
+ "program cache",
+ 4096, 64);
- brw_init_cache_id(cache, "GS_PROG", BRW_GS_PROG);
- brw_init_cache_id(cache, "BLEND_STATE", BRW_BLEND_STATE);
- brw_init_cache_id(cache, "COLOR_CALC_STATE", BRW_COLOR_CALC_STATE);
- brw_init_cache_id(cache, "DEPTH_STENCIL_STATE", BRW_DEPTH_STENCIL_STATE);
+ cache->aux_compare[BRW_VS_PROG] = brw_vs_prog_data_compare;
+ cache->aux_compare[BRW_WM_PROG] = brw_wm_prog_data_compare;
+ cache->aux_free[BRW_VS_PROG] = brw_vs_prog_data_free;
+ cache->aux_free[BRW_WM_PROG] = brw_wm_prog_data_free;
}
-void
-brw_init_caches(struct brw_context *brw)
-{
- brw_init_non_surface_cache(brw);
-}
-
-
static void
brw_clear_cache(struct brw_context *brw, struct brw_cache *cache)
{
+ struct intel_context *intel = &brw->intel;
struct brw_cache_item *c, *next;
GLuint i;
for (i = 0; i < cache->size; i++) {
for (c = cache->items[i]; c; c = next) {
- int j;
-
next = c->next;
- for (j = 0; j < c->nr_reloc_bufs; j++)
- drm_intel_bo_unreference(c->reloc_bufs[j]);
- drm_intel_bo_unreference(c->bo);
+ if (cache->aux_free[c->cache_id]) {
+ const void *item_aux = c->key + c->key_size;
+ cache->aux_free[c->cache_id](item_aux);
+ }
free((void *)c->key);
free(c);
}
cache->n_items = 0;
+ /* Start putting programs into the start of the BO again, since
+ * we'll never find the old results.
+ */
+ cache->next_offset = 0;
+
+ /* We need to make sure that the programs get regenerated, since
+ * any offsets leftover in brw_context will no longer be valid.
+ */
brw->state.dirty.mesa |= ~0;
brw->state.dirty.brw |= ~0;
brw->state.dirty.cache |= ~0;
+ intel_batchbuffer_flush(intel);
}
void
brw_state_cache_check_size(struct brw_context *brw)
{
- DBG("%s (n_items=%d)\n", __FUNCTION__, brw->cache.n_items);
-
- /* un-tuned guess. Each object is generally a page, so 1000 of them is 4 MB of
+ /* un-tuned guess. Each object is generally a page, so 2000 of them is 8 MB of
* state cache.
*/
- if (brw->cache.n_items > 1000)
+ if (brw->cache.n_items > 2000) {
+ perf_debug("Exceeded state cache size limit. Clearing the set "
+ "of compiled programs, which will trigger recompiles\n");
brw_clear_cache(brw, &brw->cache);
+ }
}
static void
brw_destroy_cache(struct brw_context *brw, struct brw_cache *cache)
{
- GLuint i;
DBG("%s\n", __FUNCTION__);
+ drm_intel_bo_unreference(cache->bo);
+ cache->bo = NULL;
brw_clear_cache(brw, cache);
- for (i = 0; i < BRW_MAX_CACHE; i++) {
- drm_intel_bo_unreference(cache->last_bo[i]);
- free(cache->name[i]);
- }
free(cache->items);
cache->items = NULL;
cache->size = 0;