* data) in return. Objects in the cache may not have relocations
* (pointers to other BOs) in them.
*
- * The inner workings are a simple hash table based on a CRC of the
+ * The inner workings are a simple hash table based on an FNV-1a hash of the
* key data.
*
* Replacement is not implemented. Instead, when the cache gets too
* big we throw out all of the cache data and let it get regenerated.
*/
-#include "main/imports.h"
+#include "main/streaming-load-memcpy.h"
+#include "x86/common_x86_asm.h"
#include "intel_batchbuffer.h"
#include "brw_state.h"
-#include "brw_vs.h"
#include "brw_wm.h"
#include "brw_gs.h"
#include "brw_cs.h"
#include "brw_program.h"
+#include "compiler/brw_eu.h"
+#include "util/u_memory.h"
#define FILE_DEBUG_FLAG DEBUG_STATE
-static unsigned
-get_program_string_id(enum brw_cache_id cache_id, const void *key)
+struct brw_cache_item {
+ /**
+ * Effectively part of the key, cache_id identifies what kind of state
+ * buffer is involved, and also which dirty flag should be set.
+ */
+ enum brw_cache_id cache_id;
+
+ /** 32-bit hash of the key data */
+ GLuint hash;
+
+ /** for variable-sized keys */
+ GLuint key_size;
+ GLuint prog_data_size;
+ const struct brw_base_prog_key *key;
+
+ uint32_t offset;
+ uint32_t size;
+
+ struct brw_cache_item *next;
+};
+
+enum brw_cache_id
+brw_stage_cache_id(gl_shader_stage stage)
{
- switch (cache_id) {
- case BRW_CACHE_VS_PROG:
- return ((struct brw_vs_prog_key *) key)->program_string_id;
- case BRW_CACHE_TCS_PROG:
- return ((struct brw_tcs_prog_key *) key)->program_string_id;
- case BRW_CACHE_TES_PROG:
- return ((struct brw_tes_prog_key *) key)->program_string_id;
- case BRW_CACHE_GS_PROG:
- return ((struct brw_gs_prog_key *) key)->program_string_id;
- case BRW_CACHE_CS_PROG:
- return ((struct brw_cs_prog_key *) key)->program_string_id;
- case BRW_CACHE_FS_PROG:
- return ((struct brw_wm_prog_key *) key)->program_string_id;
- default:
- unreachable("no program string id for this kind of program");
- }
+ static const enum brw_cache_id stage_ids[] = {
+ BRW_CACHE_VS_PROG,
+ BRW_CACHE_TCS_PROG,
+ BRW_CACHE_TES_PROG,
+ BRW_CACHE_GS_PROG,
+ BRW_CACHE_FS_PROG,
+ BRW_CACHE_CS_PROG,
+ };
+ assert((int)stage >= 0 && stage < ARRAY_SIZE(stage_ids));
+ return stage_ids[stage];
}
static GLuint
hash_key(struct brw_cache_item *item)
{
- GLuint *ikey = (GLuint *)item->key;
- GLuint hash = item->cache_id, i;
-
- assert(item->key_size % 4 == 0);
-
- /* I'm sure this can be improved on:
- */
- for (i = 0; i < item->key_size/4; i++) {
- hash ^= ikey[i];
- hash = (hash << 5) | (hash >> 27);
- }
+ uint32_t hash = _mesa_fnv32_1a_offset_bias;
+ hash = _mesa_fnv32_1a_accumulate(hash, item->cache_id);
+ hash = _mesa_fnv32_1a_accumulate_block(hash, item->key, item->key_size);
return hash;
}
* Returns the buffer object matching cache_id and key, or NULL.
*/
bool
-brw_search_cache(struct brw_cache *cache,
- enum brw_cache_id cache_id,
- const void *key, GLuint key_size,
- uint32_t *inout_offset, void *inout_aux)
+brw_search_cache(struct brw_cache *cache, enum brw_cache_id cache_id,
+ const void *key, GLuint key_size, uint32_t *inout_offset,
+ void *inout_prog_data, bool flag_state)
{
- struct brw_context *brw = cache->brw;
struct brw_cache_item *item;
struct brw_cache_item lookup;
GLuint hash;
if (item == NULL)
return false;
- void *aux = ((char *) item->key) + item->key_size;
+ void *prog_data = ((char *) item->key) + item->key_size;
- if (item->offset != *inout_offset || aux != *((void **) inout_aux)) {
- brw->ctx.NewDriverState |= (1 << cache_id);
+ if (item->offset != *inout_offset ||
+ prog_data != *((void **) inout_prog_data)) {
+ if (likely(flag_state))
+ cache->brw->ctx.NewDriverState |= (1 << cache_id);
*inout_offset = item->offset;
- *((void **) inout_aux) = aux;
+ *((void **) inout_prog_data) = prog_data;
}
return true;
brw_cache_new_bo(struct brw_cache *cache, uint32_t new_size)
{
struct brw_context *brw = cache->brw;
- drm_intel_bo *new_bo;
+ struct brw_bo *new_bo;
- new_bo = drm_intel_bo_alloc(brw->bufmgr, "program cache", new_size, 64);
- if (brw->has_llc)
- drm_intel_gem_bo_map_unsynchronized(new_bo);
+ perf_debug("Copying to larger program cache: %u kB -> %u kB\n",
+ (unsigned) cache->bo->size / 1024, new_size / 1024);
+
+ new_bo = brw_bo_alloc(brw->bufmgr, "program cache", new_size,
+ BRW_MEMZONE_SHADER);
+ if (can_do_exec_capture(brw->screen))
+ new_bo->kflags |= EXEC_OBJECT_CAPTURE;
+
+ void *map = brw_bo_map(brw, new_bo, MAP_READ | MAP_WRITE |
+ MAP_ASYNC | MAP_PERSISTENT);
/* Copy any existing data that needs to be saved. */
if (cache->next_offset != 0) {
- if (brw->has_llc) {
- memcpy(new_bo->virtual, cache->bo->virtual, cache->next_offset);
- } else {
- drm_intel_bo_map(cache->bo, false);
- drm_intel_bo_subdata(new_bo, 0, cache->next_offset,
- cache->bo->virtual);
- drm_intel_bo_unmap(cache->bo);
- }
+#ifdef USE_SSE41
+ if (!cache->bo->cache_coherent && cpu_has_sse4_1)
+ _mesa_streaming_load_memcpy(map, cache->map, cache->next_offset);
+ else
+#endif
+ memcpy(map, cache->map, cache->next_offset);
}
- if (brw->has_llc)
- drm_intel_bo_unmap(cache->bo);
- drm_intel_bo_unreference(cache->bo);
+ brw_bo_unmap(cache->bo);
+ brw_bo_unreference(cache->bo);
cache->bo = new_bo;
- cache->bo_used_by_gpu = false;
+ cache->map = map;
/* Since we have a new BO in place, we need to signal the units
* that depend on it (state base address on gen5+, or unit state before).
enum brw_cache_id cache_id,
const void *data, unsigned data_size)
{
- const struct brw_context *brw = cache->brw;
unsigned i;
const struct brw_cache_item *item;
for (i = 0; i < cache->size; i++) {
for (item = cache->items[i]; item; item = item->next) {
- int ret;
-
- if (item->cache_id != cache_id || item->size != data_size)
- continue;
-
- if (!brw->has_llc)
- drm_intel_bo_map(cache->bo, false);
- ret = memcmp(cache->bo->virtual + item->offset, data, item->size);
- if (!brw->has_llc)
- drm_intel_bo_unmap(cache->bo);
- if (ret)
+ if (item->cache_id != cache_id || item->size != data_size ||
+ memcmp(cache->map + item->offset, data, item->size) != 0)
continue;
return item;
brw_alloc_item_data(struct brw_cache *cache, uint32_t size)
{
uint32_t offset;
- struct brw_context *brw = cache->brw;
/* Allocate space in the cache BO for our new program. */
if (cache->next_offset + size > cache->bo->size) {
brw_cache_new_bo(cache, new_size);
}
- /* If we would block on writing to an in-use program BO, just
- * recreate it.
- */
- if (!brw->has_llc && cache->bo_used_by_gpu) {
- perf_debug("Copying busy program cache buffer.\n");
- brw_cache_new_bo(cache, cache->bo->size);
- }
-
offset = cache->next_offset;
/* Programs are always 64-byte aligned, so set up the next one now */
for (unsigned i = 0; i < cache->size; i++) {
for (struct brw_cache_item *c = cache->items[i]; c; c = c->next) {
if (c->cache_id == cache_id &&
- get_program_string_id(cache_id, c->key) == program_string_id) {
+ c->key->program_string_id == program_string_id) {
return c->key;
}
}
GLuint key_size,
const void *data,
GLuint data_size,
- const void *aux,
- GLuint aux_size,
+ const void *prog_data,
+ GLuint prog_data_size,
uint32_t *out_offset,
- void *out_aux)
+ void *out_prog_data)
{
- struct brw_context *brw = cache->brw;
struct brw_cache_item *item = CALLOC_STRUCT(brw_cache_item);
const struct brw_cache_item *matching_data =
brw_lookup_prog(cache, cache_id, data, data_size);
item->size = data_size;
item->key = key;
item->key_size = key_size;
- item->aux_size = aux_size;
+ item->prog_data_size = prog_data_size;
hash = hash_key(item);
item->hash = hash;
item->offset = brw_alloc_item_data(cache, data_size);
/* Copy data to the buffer */
- if (brw->has_llc) {
- memcpy((char *)cache->bo->virtual + item->offset, data, data_size);
- } else {
- drm_intel_bo_subdata(cache->bo, item->offset, data_size, data);
- }
+ memcpy(cache->map + item->offset, data, data_size);
}
- /* Set up the memory containing the key and aux_data */
- tmp = malloc(key_size + aux_size);
+ /* Set up the memory containing the key and prog_data */
+ tmp = malloc(key_size + prog_data_size);
memcpy(tmp, key, key_size);
- memcpy(tmp + key_size, aux, aux_size);
+ memcpy(tmp + key_size, prog_data, prog_data_size);
item->key = tmp;
cache->n_items++;
*out_offset = item->offset;
- *(void **)out_aux = (void *)((char *)item->key + item->key_size);
+ *(void **)out_prog_data = (void *)((char *)item->key + item->key_size);
cache->brw->ctx.NewDriverState |= 1 << cache_id;
}
cache->items =
calloc(cache->size, sizeof(struct brw_cache_item *));
- cache->bo = drm_intel_bo_alloc(brw->bufmgr, "program cache", 4096, 64);
- if (brw->has_llc)
- drm_intel_gem_bo_map_unsynchronized(cache->bo);
+ cache->bo = brw_bo_alloc(brw->bufmgr, "program cache", 16384,
+ BRW_MEMZONE_SHADER);
+ if (can_do_exec_capture(brw->screen))
+ cache->bo->kflags |= EXEC_OBJECT_CAPTURE;
+
+ cache->map = brw_bo_map(brw, cache->bo, MAP_READ | MAP_WRITE |
+ MAP_ASYNC | MAP_PERSISTENT);
}
static void
c->cache_id == BRW_CACHE_GS_PROG ||
c->cache_id == BRW_CACHE_FS_PROG ||
c->cache_id == BRW_CACHE_CS_PROG) {
- const void *item_aux = c->key + c->key_size;
- brw_stage_prog_data_free(item_aux);
+ const void *item_prog_data = ((char *)c->key) + c->key_size;
+ brw_stage_prog_data_free(item_prog_data);
}
free((void *)c->key);
free(c);
perf_debug("Exceeded state cache size limit. Clearing the set "
"of compiled programs, which will trigger recompiles\n");
brw_clear_cache(brw, &brw->cache);
+ brw_cache_new_bo(&brw->cache, brw->cache.bo->size);
}
}
DBG("%s\n", __func__);
- if (brw->has_llc)
- drm_intel_bo_unmap(cache->bo);
- drm_intel_bo_unreference(cache->bo);
- cache->bo = NULL;
+ /* This can be NULL if context creation failed early on */
+ if (cache->bo) {
+ brw_bo_unmap(cache->bo);
+ brw_bo_unreference(cache->bo);
+ cache->bo = NULL;
+ cache->map = NULL;
+ }
brw_clear_cache(brw, cache);
free(cache->items);
cache->items = NULL;
{
brw_destroy_cache(brw, &brw->cache);
}
+
+static const char *
+cache_name(enum brw_cache_id cache_id)
+{
+ switch (cache_id) {
+ case BRW_CACHE_VS_PROG:
+ return "VS kernel";
+ case BRW_CACHE_TCS_PROG:
+ return "TCS kernel";
+ case BRW_CACHE_TES_PROG:
+ return "TES kernel";
+ case BRW_CACHE_FF_GS_PROG:
+ return "Fixed-function GS kernel";
+ case BRW_CACHE_GS_PROG:
+ return "GS kernel";
+ case BRW_CACHE_CLIP_PROG:
+ return "CLIP kernel";
+ case BRW_CACHE_SF_PROG:
+ return "SF kernel";
+ case BRW_CACHE_FS_PROG:
+ return "FS kernel";
+ case BRW_CACHE_CS_PROG:
+ return "CS kernel";
+ default:
+ return "unknown";
+ }
+}
+
+void
+brw_print_program_cache(struct brw_context *brw)
+{
+   const struct brw_cache *cache = &brw->cache;
+   struct brw_cache_item *item;
+
+   for (unsigned i = 0; i < cache->size; i++) {
+      for (item = cache->items[i]; item; item = item->next) {
+         /* 'i' is the hash-bucket index, not a cache id; use the item's
+          * own cache_id so the label matches the program being dumped.
+          */
+         fprintf(stderr, "%s:\n", cache_name(item->cache_id));
+         brw_disassemble(&brw->screen->devinfo, cache->map,
+                         item->offset, item->size, stderr);
+      }
+   }
+}