X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fmesa%2Fdrivers%2Fdri%2Fi965%2Fbrw_state_cache.c;h=0e98e654c1c7c67e41afc2de4a59999c5f701d93;hb=22d9a4824baf0bf89bb8e39025ad01fecb213888;hp=bb5047ea4d6b6bf4a5508a1f62e5b14d52a4672d;hpb=c35f14f36880eb20f5e54480444e343520e9bec5;p=mesa.git diff --git a/src/mesa/drivers/dri/i965/brw_state_cache.c b/src/mesa/drivers/dri/i965/brw_state_cache.c index bb5047ea4d6..0e98e654c1c 100644 --- a/src/mesa/drivers/dri/i965/brw_state_cache.c +++ b/src/mesa/drivers/dri/i965/brw_state_cache.c @@ -49,8 +49,9 @@ #include "brw_state.h" #include "brw_vs.h" #include "brw_wm.h" -#include "brw_vs.h" -#include "brw_vec4_gs.h" +#include "brw_gs.h" +#include "brw_cs.h" +#include "brw_program.h" #define FILE_DEBUG_FLAG DEBUG_STATE @@ -137,7 +138,7 @@ bool brw_search_cache(struct brw_cache *cache, enum brw_cache_id cache_id, const void *key, GLuint key_size, - uint32_t *inout_offset, void *out_aux) + uint32_t *inout_offset, void *inout_aux) { struct brw_context *brw = cache->brw; struct brw_cache_item *item; @@ -155,11 +156,12 @@ brw_search_cache(struct brw_cache *cache, if (item == NULL) return false; - *(void **)out_aux = ((char *)item->key + item->key_size); + void *aux = ((char *) item->key) + item->key_size; - if (item->offset != *inout_offset) { - SET_DIRTY_BIT(cache, 1 << cache_id); + if (item->offset != *inout_offset || aux != *((void **) inout_aux)) { + brw->ctx.NewDriverState |= (1 << cache_id); *inout_offset = item->offset; + *((void **) inout_aux) = aux; } return true; @@ -172,14 +174,23 @@ brw_cache_new_bo(struct brw_cache *cache, uint32_t new_size) drm_intel_bo *new_bo; new_bo = drm_intel_bo_alloc(brw->bufmgr, "program cache", new_size, 64); + if (brw->has_llc) + drm_intel_gem_bo_map_unsynchronized(new_bo); /* Copy any existing data that needs to be saved. */ if (cache->next_offset != 0) { - drm_intel_bo_map(cache->bo, false); - drm_intel_bo_subdata(new_bo, 0, cache->next_offset, cache->bo->virtual); - drm_intel_bo_unmap(cache->bo); + if (brw->has_llc) { + memcpy(new_bo->virtual, cache->bo->virtual, cache->next_offset); + } else { + drm_intel_bo_map(cache->bo, false); + drm_intel_bo_subdata(new_bo, 0, cache->next_offset, + cache->bo->virtual); + drm_intel_bo_unmap(cache->bo); + } } + if (brw->has_llc) + drm_intel_bo_unmap(cache->bo); drm_intel_bo_unreference(cache->bo); cache->bo = new_bo; cache->bo_used_by_gpu = false; @@ -187,65 +198,55 @@ brw_cache_new_bo(struct brw_cache *cache, uint32_t new_size) /* Since we have a new BO in place, we need to signal the units * that depend on it (state base address on gen5+, or unit state before). */ - SET_DIRTY_BIT(brw, BRW_NEW_PROGRAM_CACHE); + brw->ctx.NewDriverState |= BRW_NEW_PROGRAM_CACHE; + brw->batch.state_base_address_emitted = false; } /** - * Attempts to find an item in the cache with identical data and aux - * data to use + * Attempts to find an item in the cache with identical data. */ -static bool -brw_try_upload_using_copy(struct brw_cache *cache, - struct brw_cache_item *result_item, - const void *data, - const void *aux) +static const struct brw_cache_item * +brw_lookup_prog(const struct brw_cache *cache, + enum brw_cache_id cache_id, + const void *data, unsigned data_size) { - int i; - struct brw_cache_item *item; + const struct brw_context *brw = cache->brw; + unsigned i; + const struct brw_cache_item *item; for (i = 0; i < cache->size; i++) { for (item = cache->items[i]; item; item = item->next) { - const void *item_aux = item->key + item->key_size; int ret; - if (item->cache_id != result_item->cache_id || - item->size != result_item->size || - item->aux_size != result_item->aux_size) { + if (item->cache_id != cache_id || item->size != data_size) continue; - } - if (cache->aux_compare[result_item->cache_id]) { - if (!cache->aux_compare[result_item->cache_id](item_aux, aux)) - continue; - } else if (memcmp(item_aux, aux, item->aux_size) != 0) { - continue; - } - - drm_intel_bo_map(cache->bo, false); + if (!brw->has_llc) + drm_intel_bo_map(cache->bo, false); ret = memcmp(cache->bo->virtual + item->offset, data, item->size); - drm_intel_bo_unmap(cache->bo); + if (!brw->has_llc) + drm_intel_bo_unmap(cache->bo); if (ret) continue; - result_item->offset = item->offset; - - return true; + return item; } } - return false; + return NULL; } -static void -brw_upload_item_data(struct brw_cache *cache, - struct brw_cache_item *item, - const void *data) +static uint32_t +brw_alloc_item_data(struct brw_cache *cache, uint32_t size) { + uint32_t offset; + struct brw_context *brw = cache->brw; + /* Allocate space in the cache BO for our new program. */ - if (cache->next_offset + item->size > cache->bo->size) { + if (cache->next_offset + size > cache->bo->size) { uint32_t new_size = cache->bo->size * 2; - while (cache->next_offset + item->size > new_size) + while (cache->next_offset + size > new_size) new_size *= 2; brw_cache_new_bo(cache, new_size); @@ -254,14 +255,17 @@ brw_upload_item_data(struct brw_cache *cache, /* If we would block on writing to an in-use program BO, just * recreate it. */ - if (cache->bo_used_by_gpu) { + if (!brw->has_llc && cache->bo_used_by_gpu) { + perf_debug("Copying busy program cache buffer.\n"); brw_cache_new_bo(cache, cache->bo->size); } - item->offset = cache->next_offset; + offset = cache->next_offset; /* Programs are always 64-byte aligned, so set up the next one now */ - cache->next_offset = ALIGN(item->offset + item->size, 64); + cache->next_offset = ALIGN(offset + size, 64); + + return offset; } void @@ -278,6 +282,8 @@ brw_upload_cache(struct brw_cache *cache, { struct brw_context *brw = cache->brw; struct brw_cache_item *item = CALLOC_STRUCT(brw_cache_item); + const struct brw_cache_item *matching_data = + brw_lookup_prog(cache, cache_id, data, data_size); GLuint hash; void *tmp; @@ -289,15 +295,23 @@ brw_upload_cache(struct brw_cache *cache, hash = hash_key(item); item->hash = hash; - /* If we can find a matching prog/prog_data combo in the cache - * already, then reuse the existing stuff. This will mean not - * flagging CACHE_NEW_* when transitioning between the two - * equivalent hash keys. This is notably useful for programs - * generating shaders at runtime, where multiple shaders may - * compile to the thing in our backend. + /* If we can find a matching prog in the cache already, then reuse the + * existing stuff without creating new copy into the underlying buffer + * object. This is notably useful for programs generating shaders at + * runtime, where multiple shaders may compile to the same thing in our + * backend. */ - if (!brw_try_upload_using_copy(cache, item, data, aux)) { - brw_upload_item_data(cache, item, data); + if (matching_data) { + item->offset = matching_data->offset; + } else { + item->offset = brw_alloc_item_data(cache, data_size); + + /* Copy data to the buffer */ + if (brw->has_llc) { + memcpy((char *)cache->bo->virtual + item->offset, data, data_size); + } else { + drm_intel_bo_subdata(cache->bo, item->offset, data_size, data); + } } /* Set up the memory containing the key and aux_data */ @@ -308,7 +322,7 @@ brw_upload_cache(struct brw_cache *cache, item->key = tmp; - if (cache->n_items > cache->size * 1.5) + if (cache->n_items > cache->size * 1.5f) rehash(cache); hash %= cache->size; @@ -316,12 +330,9 @@ brw_upload_cache(struct brw_cache *cache, cache->items[hash] = item; cache->n_items++; - /* Copy data to the buffer */ - drm_intel_bo_subdata(cache->bo, item->offset, data_size, data); - *out_offset = item->offset; *(void **)out_aux = (void *)((char *)item->key + item->key_size); - SET_DIRTY_BIT(cache, 1 << cache_id); + cache->brw->ctx.NewDriverState |= 1 << cache_id; } void @@ -339,13 +350,8 @@ brw_init_caches(struct brw_context *brw) cache->bo = drm_intel_bo_alloc(brw->bufmgr, "program cache", 4096, 64); - - cache->aux_compare[BRW_VS_PROG] = brw_vs_prog_data_compare; - cache->aux_compare[BRW_GS_PROG] = brw_gs_prog_data_compare; - cache->aux_compare[BRW_WM_PROG] = brw_wm_prog_data_compare; - cache->aux_free[BRW_VS_PROG] = brw_stage_prog_data_free; - cache->aux_free[BRW_GS_PROG] = brw_stage_prog_data_free; - cache->aux_free[BRW_WM_PROG] = brw_stage_prog_data_free; + if (brw->has_llc) + drm_intel_gem_bo_map_unsynchronized(cache->bo); } static void @@ -354,14 +360,17 @@ brw_clear_cache(struct brw_context *brw, struct brw_cache *cache) struct brw_cache_item *c, *next; GLuint i; - DBG("%s\n", __FUNCTION__); + DBG("%s\n", __func__); for (i = 0; i < cache->size; i++) { for (c = cache->items[i]; c; c = next) { next = c->next; - if (cache->aux_free[c->cache_id]) { + if (c->cache_id == BRW_CACHE_VS_PROG || + c->cache_id == BRW_CACHE_GS_PROG || + c->cache_id == BRW_CACHE_FS_PROG || + c->cache_id == BRW_CACHE_CS_PROG) { const void *item_aux = c->key + c->key_size; - cache->aux_free[c->cache_id](item_aux); + brw_stage_prog_data_free(item_aux); } free((void *)c->key); free(c); @@ -379,9 +388,27 @@ brw_clear_cache(struct brw_context *brw, struct brw_cache *cache) /* We need to make sure that the programs get regenerated, since * any offsets leftover in brw_context will no longer be valid. */ - SET_DIRTY_ALL(mesa); - SET_DIRTY64_ALL(brw); - SET_DIRTY_ALL(cache); + brw->NewGLState = ~0; + brw->ctx.NewDriverState = ~0ull; + brw->state.pipelines[BRW_RENDER_PIPELINE].mesa = ~0; + brw->state.pipelines[BRW_RENDER_PIPELINE].brw = ~0ull; + brw->state.pipelines[BRW_COMPUTE_PIPELINE].mesa = ~0; + brw->state.pipelines[BRW_COMPUTE_PIPELINE].brw = ~0ull; + + /* Also, NULL out any stale program pointers. */ + brw->vs.prog_data = NULL; + brw->vs.base.prog_data = NULL; + brw->tcs.prog_data = NULL; + brw->tcs.base.prog_data = NULL; + brw->tes.prog_data = NULL; + brw->tes.base.prog_data = NULL; + brw->gs.prog_data = NULL; + brw->gs.base.prog_data = NULL; + brw->wm.prog_data = NULL; + brw->wm.base.prog_data = NULL; + brw->cs.prog_data = NULL; + brw->cs.base.prog_data = NULL; + intel_batchbuffer_flush(brw); } @@ -403,8 +430,10 @@ static void brw_destroy_cache(struct brw_context *brw, struct brw_cache *cache) { - DBG("%s\n", __FUNCTION__); + DBG("%s\n", __func__); + if (brw->has_llc) + drm_intel_bo_unmap(cache->bo); drm_intel_bo_unreference(cache->bo); cache->bo = NULL; brw_clear_cache(brw, cache);